# Heart Disease Dataset
# This data set dates from 1988 and consists of four databases: Cleveland, Hungary,
# Switzerland, and Long Beach V. It contains 76 attributes, including the predicted
# attribute, but all published experiments refer to using a subset of 14 of them.
# The "target" field refers to the presence of heart disease in the patient.
# It is integer valued 0 = no disease and 1 = disease.
# This is multivariate type of dataset which means providing or involving a variety of
# separate mathematical or statistical variables, multivariate numerical data analysis.
# It is composed of 14 attributes.
#
# One of the major tasks on this dataset is to predict based on the given attributes of a
# patient that whether that particular person has a heart disease or not and other is the
# experimental task to diagnose and find out various insights from this dataset which could
# help in understanding the problem more.
# There are various factors associated in the process of determining whether a person will
# have a heart disease or not. In this project, we will do the analysis on some hypothesis and
# will come up with some conclusions for the same.
# Dataset columns:
# age: The person’s age in years
# sex: The person’s sex (1 = male, 0 = female)
# cp: chest pain type
# — Value 0: asymptomatic
# — Value 1: atypical angina
# — Value 2: non-anginal pain
# — Value 3: typical angina
# trestbps: The person’s resting blood pressure (mm Hg on admission to the hospital)
# chol: The person’s cholesterol measurement in mg/dl
# fbs: The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
# restecg: resting electrocardiographic results
# Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria
# Value 1: normal
# Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
# thalach: The person’s maximum heart rate achieved
# exang: Exercise induced angina (1 = yes; 0 = no)
# oldpeak: ST depression induced by exercise relative to rest (‘ST’ relates to positions on the ECG plot. See more here)
# slope: the slope of the peak exercise ST segment — 0: downsloping; 1: flat; 2: upsloping
# 0: downsloping; 1: flat; 2: upsloping
# ca: The number of major vessels (0–3)
# thal: A blood disorder called thalassemia Value 0: NULL (dropped from the dataset previously
# Value 1: fixed defect (no blood flow in some part of the heart)
# Value 2: normal blood flow
# Value 3: reversible defect (a blood flow is observed but it is not normal)
# target: Heart disease (1 = no, 0= yes)
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(forcats)
library(rsample)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.2.1 ✔ purrr 1.0.1
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.1 ✔ recipes 1.0.5
## ✔ dials 1.2.0 ✔ tune 1.1.1
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.1.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.0 ✔ yardstick 1.1.0
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Dig deeper into tidy modeling with R at https://www.tmwr.org
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(readr)
library(corrplot)
## corrplot 0.92 loaded
heart <- read_csv("/Users/sarju/Desktop/MITA Sem 2/MVA/Individual_Project/heart.csv")
## Rows: 1025 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpea...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
heart
## # A tibble: 1,025 × 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 1 0 125 212 0 1 168 0 1 2
## 2 53 1 0 140 203 1 0 155 1 3.1 0
## 3 70 1 0 145 174 0 1 125 1 2.6 0
## 4 61 1 0 148 203 0 1 161 0 0 2
## 5 62 0 0 138 294 1 1 106 0 1.9 1
## 6 58 0 0 100 248 0 0 122 0 1 1
## 7 58 1 0 114 318 0 2 140 0 4.4 0
## 8 55 1 0 160 289 0 0 145 1 0.8 1
## 9 46 1 0 120 249 0 0 144 0 0.8 2
## 10 54 1 0 122 286 0 0 116 1 3.2 1
## # … with 1,015 more rows, and 3 more variables: ca <dbl>, thal <dbl>,
## # target <dbl>
dim(heart) # 1025 rows & 14 columns
## [1] 1025 14
colnames(heart)
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
str(heart)
## spc_tbl_ [1,025 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:1025] 52 53 70 61 62 58 58 55 46 54 ...
## $ sex : num [1:1025] 1 1 1 1 0 0 1 1 1 1 ...
## $ cp : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ trestbps: num [1:1025] 125 140 145 148 138 100 114 160 120 122 ...
## $ chol : num [1:1025] 212 203 174 203 294 248 318 289 249 286 ...
## $ fbs : num [1:1025] 0 1 0 0 1 0 0 0 0 0 ...
## $ restecg : num [1:1025] 1 0 1 1 1 0 2 0 0 0 ...
## $ thalach : num [1:1025] 168 155 125 161 106 122 140 145 144 116 ...
## $ exang : num [1:1025] 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num [1:1025] 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
## $ slope : num [1:1025] 2 0 0 2 1 1 0 1 2 1 ...
## $ ca : num [1:1025] 2 0 0 1 3 0 3 1 0 2 ...
## $ thal : num [1:1025] 3 3 3 3 2 2 1 3 3 2 ...
## $ target : num [1:1025] 0 0 0 0 0 1 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_double(),
## .. cp = col_double(),
## .. trestbps = col_double(),
## .. chol = col_double(),
## .. fbs = col_double(),
## .. restecg = col_double(),
## .. thalach = col_double(),
## .. exang = col_double(),
## .. oldpeak = col_double(),
## .. slope = col_double(),
## .. ca = col_double(),
## .. thal = col_double(),
## .. target = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(heart)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.0000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :1.0000 Median :130.0
## Mean :54.43 Mean :0.6956 Mean :0.9424 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.0000 Max. :200.0
## chol fbs restecg thalach
## Min. :126 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132.0
## Median :240 Median :0.0000 Median :1.0000 Median :152.0
## Mean :246 Mean :0.1493 Mean :0.5298 Mean :149.1
## 3rd Qu.:275 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :1.000 Median :0.0000
## Mean :0.3366 Mean :1.072 Mean :1.385 Mean :0.7541
## 3rd Qu.:1.0000 3rd Qu.:1.800 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.324 Mean :0.5132
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
# str function says that target column is a num type... so will make it a factor of 2 groups
# heart$target <- as.factor(heart$target)
# also converting cp column into factor of 4 groups
# heart$cp <- as.factor(heart$cp)
#
# heart$sex <- as.character(heart$sex)
str(heart)
## spc_tbl_ [1,025 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:1025] 52 53 70 61 62 58 58 55 46 54 ...
## $ sex : num [1:1025] 1 1 1 1 0 0 1 1 1 1 ...
## $ cp : num [1:1025] 0 0 0 0 0 0 0 0 0 0 ...
## $ trestbps: num [1:1025] 125 140 145 148 138 100 114 160 120 122 ...
## $ chol : num [1:1025] 212 203 174 203 294 248 318 289 249 286 ...
## $ fbs : num [1:1025] 0 1 0 0 1 0 0 0 0 0 ...
## $ restecg : num [1:1025] 1 0 1 1 1 0 2 0 0 0 ...
## $ thalach : num [1:1025] 168 155 125 161 106 122 140 145 144 116 ...
## $ exang : num [1:1025] 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num [1:1025] 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
## $ slope : num [1:1025] 2 0 0 2 1 1 0 1 2 1 ...
## $ ca : num [1:1025] 2 0 0 1 3 0 3 1 0 2 ...
## $ thal : num [1:1025] 3 3 3 3 2 2 1 3 3 2 ...
## $ target : num [1:1025] 0 0 0 0 0 1 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_double(),
## .. cp = col_double(),
## .. trestbps = col_double(),
## .. chol = col_double(),
## .. fbs = col_double(),
## .. restecg = col_double(),
## .. thalach = col_double(),
## .. exang = col_double(),
## .. oldpeak = col_double(),
## .. slope = col_double(),
## .. ca = col_double(),
## .. thal = col_double(),
## .. target = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
#
# heart[heart$sex==1, 'sex'] <- "male"
# heart[heart$sex==0, 'sex'] <- "female"
# heart$sex
heart_x = subset(heart, select = c(age,cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal))
heart_x # heart_x contains all the columns except the target column...
## # A tibble: 1,025 × 12
## age cp trestbps chol fbs restecg thalach exang oldpeak slope ca
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 0 125 212 0 1 168 0 1 2 2
## 2 53 0 140 203 1 0 155 1 3.1 0 0
## 3 70 0 145 174 0 1 125 1 2.6 0 0
## 4 61 0 148 203 0 1 161 0 0 2 1
## 5 62 0 138 294 1 1 106 0 1.9 1 3
## 6 58 0 100 248 0 0 122 0 1 1 0
## 7 58 0 114 318 0 2 140 0 4.4 0 3
## 8 55 0 160 289 0 0 145 1 0.8 1 1
## 9 46 0 120 249 0 0 144 0 0.8 2 0
## 10 54 0 122 286 0 0 116 1 3.2 1 2
## # … with 1,015 more rows, and 1 more variable: thal <dbl>
heart_y = heart$target
heart_y
## [1] 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 1 0 1
## [38] 1 1 0 1 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0
## [75] 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 0 0 0
## [112] 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0
## [149] 1 1 0 0 0 1 0 1 1 1 1 1 0 1 0 0 0 0 0 1 1 1 1 0 1 1 0 0 0 0 0 0 0 1 0 1 1
## [186] 0 0 0 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 0 1 1 1 0 1 0 0 0 0
## [223] 1 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 1 1 0 0 1 1 0 1 0 0 0 1 1 1 0
## [260] 1 1 1 1 1 0 1 0 0 0 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 0 0
## [297] 0 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0 1 1 1 0 0 1 1 0 0 0 1 1 0 1
## [334] 1 0 0 1 1 0 0 1 1 1 1 1 0 0 1 0 0 1 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 1 1
## [371] 0 0 1 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 1
## [408] 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 1
## [445] 1 1 1 0 1 0 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 1 0 0
## [482] 0 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1
## [519] 0 0 0 1 1 0 0 1 0 1 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 1 0
## [556] 0 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 1 1 1 0 0 1 0 0 0 0 1 0
## [593] 0 0 0 0 1 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1
## [630] 0 0 1 1 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 1 1 1 1
## [667] 1 0 1 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1 0 0 0 1 0 1
## [704] 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 0 1 1 0 1 1 0 1 0 0 0 0
## [741] 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0
## [778] 0 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 1 0 1 1 1 0 0 0 0
## [815] 1 1 1 1 1 0 0 0 0 1 0 1 1 0 1 0 1 1 1 0 0 0 1 1 1 1 0 1 0 0 0 0 1 0 1 0 0
## [852] 1 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1
## [889] 0 0 0 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0
## [926] 0 0 1 0 0 0 0 1 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 0 1 1 1 1
## [963] 1 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0
## [1000] 0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0
heart_cm <- colMeans(heart_x)
heart_cm
## age cp trestbps chol fbs restecg
## 54.4341463 0.9424390 131.6117073 246.0000000 0.1492683 0.5297561
## thalach exang oldpeak slope ca thal
## 149.1141463 0.3365854 1.0715122 1.3853659 0.7541463 2.3239024
heart_S <- cov(heart_x)
heart_S
## age cp trestbps chol fbs
## age 82.3064501 -0.67225133 43.0857327 102.8906250 0.392163681
## cp -0.6722513 1.06016006 0.6885652 -4.3369141 0.029108232
## trestbps 43.0857327 0.68856517 306.8354097 115.6572266 1.135164825
## chol 102.8906250 -4.33691406 115.6572266 2661.7871094 0.495117188
## fbs 0.3921637 0.02910823 1.1351648 0.4951172 0.127111280
## restecg -0.6354897 0.02368712 -1.1446846 -4.0146484 -0.019582698
## thalach -81.4460890 7.26829554 -15.8228220 -25.8417969 -0.072719131
## exang 0.3781441 -0.19545065 0.5067978 1.6435547 0.008303163
## oldpeak 2.2188253 -0.21140701 3.8579706 3.9333008 0.004549447
## slope -0.9477420 0.08372713 -1.3033441 -0.4541016 -0.013633765
## ca 2.5394579 -0.18701696 1.8878420 3.9492188 0.050405869
## thal 0.4070932 -0.10438453 0.6444465 3.2099609 -0.009333079
## restecg thalach exang oldpeak slope
## age -0.635489710 -81.44608899 0.378144055 2.218825267 -0.94774200
## cp 0.023687119 7.26829554 -0.195450648 -0.211407012 0.08372713
## trestbps -1.144684642 -15.82282203 0.506797828 3.857970560 -1.30334413
## chol -4.014648438 -25.84179688 1.643554688 3.933300781 -0.45410156
## fbs -0.019582698 -0.07271913 0.008303163 0.004549447 -0.01363377
## restecg 0.278654726 0.58790873 -0.016372904 -0.031085080 0.02807260
## thalach 0.587908727 529.26332508 -4.136113758 -9.456022389 5.61807832
## exang -0.016372904 -4.13611376 0.223513720 0.172683880 -0.07807736
## oldpeak -0.031085080 -9.45602239 0.172683880 1.380750152 -0.41752668
## slope 0.028072599 5.61807832 -0.078077363 -0.417526677 0.38162157
## ca -0.042481898 -4.92991711 0.052558117 0.268672923 -0.04676543
## thal -0.006717797 -1.40028963 0.057864901 0.147810499 -0.03607565
## ca thal
## age 2.53945789 0.407093178
## cp -0.18701696 -0.104384527
## trestbps 1.88784204 0.644446456
## chol 3.94921875 3.209960938
## fbs 0.05040587 -0.009333079
## restecg -0.04248190 -0.006717797
## thalach -4.92991711 -1.400289634
## exang 0.05255812 0.057864901
## oldpeak 0.26867292 0.147810499
## slope -0.04676543 -0.036075648
## ca 1.06254383 0.095335366
## thal 0.09533537 0.385219131
d <- apply(heart_x, MARGIN = 1, function(heart_x)t(heart_x - heart_cm) %*% solve(heart_S) %*% (heart_x - heart_cm)) # taking the distance... (distance formula)
d
## [1] 7.567230 18.324510 15.885833 9.366377 18.787735 10.011945 38.960972
## [8] 8.675187 8.844229 9.931920 15.991847 21.134232 7.715376 15.765682
## [15] 27.869344 7.715376 9.132464 6.522142 3.684812 10.241469 10.425369
## [22] 10.594284 12.479870 6.171794 6.495000 7.624908 10.202440 11.374880
## [29] 16.229769 29.556340 18.145348 3.684812 10.851811 13.648647 3.416521
## [36] 12.832129 18.384857 8.411755 10.523994 6.859389 12.145491 7.289091
## [43] 8.902688 8.844229 7.967498 7.196298 12.799737 21.806891 8.163048
## [50] 16.860813 13.259467 9.071561 24.410827 13.200681 19.721716 19.721716
## [57] 11.708776 6.364545 10.263652 5.696189 12.567033 8.163048 13.117883
## [64] 13.513219 12.567033 15.464898 16.158613 17.385257 5.334974 34.149741
## [71] 14.786531 9.576794 9.954783 11.934555 14.457960 5.261782 11.923134
## [78] 21.414340 3.303600 3.303600 6.613851 18.350602 12.832129 24.410827
## [85] 5.334974 3.083185 10.241469 16.346673 15.758809 19.593698 6.000657
## [92] 7.796978 21.414340 18.145348 12.685021 8.520165 5.767077 10.739928
## [99] 14.890842 20.806033 5.423973 18.197598 7.796058 5.261782 16.220477
## [106] 8.804543 10.528573 10.378807 11.689726 11.227858 8.276915 13.390193
## [113] 14.457960 11.255072 15.266901 7.624908 6.123407 11.556829 12.567033
## [120] 5.603233 7.796058 18.145348 6.171322 19.868914 9.499031 8.478883
## [127] 9.746636 16.158613 18.245952 11.378978 8.478883 4.837180 12.799737
## [134] 3.416521 7.796058 20.865359 7.967498 20.093110 5.261782 4.893574
## [141] 13.562466 10.716731 10.643449 11.855798 6.082881 14.074218 5.732385
## [148] 11.327498 16.634908 9.987618 38.960972 21.131070 8.903341 7.289091
## [155] 17.164473 7.796058 17.905638 7.289091 47.534084 6.194887 22.692537
## [162] 10.500501 22.692537 8.300828 9.954783 14.786531 13.875012 12.737300
## [169] 10.923991 7.741729 10.083886 18.312262 17.571000 7.766130 6.933679
## [176] 25.882587 18.312262 10.742021 8.276915 14.399568 21.414340 17.978966
## [183] 9.181265 6.126505 5.783882 19.540936 14.074218 9.954783 6.123407
## [190] 11.399358 4.916928 5.999869 47.534084 15.882222 12.865842 13.513219
## [197] 8.870408 7.677090 9.753318 8.730911 7.796978 11.855798 16.634908
## [204] 15.158247 8.163048 13.711653 15.266901 4.916928 24.410827 10.378807
## [211] 26.925677 13.117883 11.568543 10.923991 5.783882 3.253317 8.294272
## [218] 6.194887 11.189485 9.931920 7.210865 6.123407 17.978966 7.766130
## [225] 11.448772 8.746441 18.350602 6.681593 11.661816 21.806891 8.903341
## [232] 5.084182 6.075646 6.634512 5.033111 15.158247 18.350602 5.705230
## [239] 16.860813 8.838920 11.982901 10.263652 24.410827 10.425369 12.954323
## [246] 5.627663 21.131070 11.189485 8.717593 6.126505 8.804543 7.741729
## [253] 9.208650 19.540936 10.754012 13.668936 12.214204 20.093110 24.560255
## [260] 9.746636 3.416345 8.746441 7.631827 7.967498 20.865359 12.479870
## [267] 10.739928 20.392370 16.297574 17.196970 7.543155 5.322923 8.717593
## [274] 11.227858 20.685614 18.312262 10.683295 5.627663 8.675187 5.448627
## [281] 5.467871 5.998285 7.707600 17.978966 16.297574 17.196970 10.715211
## [288] 11.716112 8.433534 11.689726 18.245952 6.213733 13.318870 13.513219
## [295] 25.882587 13.117883 20.392370 12.452186 5.998285 6.064092 4.453693
## [302] 14.523380 7.967498 6.171322 6.646520 8.439129 6.681593 3.083185
## [309] 15.266901 4.240356 9.499031 14.457960 13.648647 13.858305 17.905638
## [316] 13.711653 10.715211 6.171794 11.934555 23.360333 5.951457 6.432516
## [323] 19.016641 25.353496 6.364545 4.240356 21.131070 14.328320 13.736614
## [330] 23.360333 5.334974 15.429358 19.754734 5.504205 6.541213 12.452186
## [337] 18.197598 14.207833 16.229769 6.763125 24.410827 3.416345 7.837793
## [344] 18.016146 6.194887 25.353496 11.568543 6.000657 26.631753 13.137499
## [351] 9.746636 8.730911 11.255072 12.205877 5.705230 8.939781 28.839256
## [358] 12.865842 9.622106 23.360333 7.631827 10.686158 5.423973 14.316413
## [365] 7.210865 14.316413 10.100743 15.488123 13.747349 9.731331 21.134232
## [372] 9.208650 5.504205 5.730386 16.984642 20.685614 9.368690 8.442907
## [379] 20.392370 16.220477 10.806579 16.297574 7.232464 12.452186 12.723347
## [386] 9.731331 6.495000 22.692537 10.742021 20.441713 11.374880 13.748795
## [393] 9.731331 34.149741 17.164473 8.163048 18.637960 14.074218 20.685614
## [400] 20.441713 13.200681 15.991847 9.769501 4.453693 9.576794 16.229769
## [407] 10.241469 8.870408 8.939781 12.832129 5.448627 9.954783 14.328320
## [414] 13.736614 15.464898 7.707600 14.207833 18.245952 7.796978 12.145491
## [421] 5.767077 12.924752 12.737300 11.374880 19.016641 6.780655 7.757639
## [428] 9.933930 26.631753 4.821205 10.263652 13.485158 9.753318 19.754734
## [435] 12.441599 12.737300 16.346673 8.371008 3.553975 5.730386 8.838920
## [442] 9.181265 13.562466 9.933930 3.553975 11.661816 14.316413 8.675187
## [449] 5.732385 4.844452 22.931051 5.619202 21.806891 5.619202 13.485158
## [456] 15.882222 4.821205 8.717593 10.083886 11.448772 11.419118 11.923134
## [463] 16.220477 7.543155 47.534084 24.410827 5.627663 8.804543 13.318870
## [470] 12.865842 8.478883 10.715211 6.784111 7.416307 15.488123 18.197598
## [477] 19.540936 6.859389 9.987618 8.861752 6.780655 22.931051 15.765682
## [484] 5.998285 14.890842 10.378807 11.327498 8.294272 11.982901 13.318870
## [491] 5.767077 9.933930 11.399358 9.208650 12.954323 8.620682 18.637960
## [498] 5.696189 6.317307 16.984642 15.991847 12.214204 16.984642 5.467871
## [505] 4.821205 6.681593 9.499031 7.707600 25.882587 29.556340 11.982901
## [512] 11.556829 7.978755 11.227858 3.083185 13.200681 6.541213 5.448627
## [519] 13.200681 9.366377 11.934555 17.885162 8.661545 9.366377 10.100743
## [526] 4.837180 34.149741 7.796978 22.972734 15.882222 8.498527 7.837793
## [533] 10.716731 5.809265 6.000657 20.806033 3.969666 6.064092 8.844229
## [540] 15.158247 6.213733 5.440476 13.668936 9.622106 14.207833 15.488123
## [547] 13.736614 10.594284 5.732385 9.369839 11.419118 9.931920 21.134232
## [554] 16.158613 8.870408 19.003499 11.934555 7.631827 7.766130 20.392370
## [561] 6.784111 7.837793 6.495000 7.978755 13.875012 5.809265 6.261242
## [568] 7.416307 7.757639 26.925677 11.173832 6.171322 11.855798 7.978755
## [575] 6.075646 10.083886 16.634908 13.736614 13.668936 10.806579 16.798833
## [582] 5.440476 3.022686 11.556829 9.208650 17.905638 11.399358 28.839256
## [589] 7.624908 9.931920 13.858305 15.429358 13.648647 15.429358 12.973251
## [596] 9.366377 9.368690 24.410827 5.730386 10.089017 13.137499 11.189485
## [603] 8.433534 10.089017 3.553975 17.196970 11.423574 17.385257 12.973251
## [610] 29.556340 21.134232 11.780865 20.865359 19.721716 6.780655 3.969666
## [617] 10.606454 6.634512 3.253317 14.890842 13.562466 14.457960 18.787735
## [624] 10.643449 22.972734 11.419118 16.297574 24.560255 13.164780 13.369841
## [631] 13.748795 3.253317 5.467871 12.466745 7.567230 5.951457 16.346673
## [638] 8.433534 13.369841 6.784111 8.939781 14.399568 10.523994 11.944947
## [645] 3.416345 11.568543 4.821205 8.442907 15.991847 5.467871 8.439129
## [652] 6.194887 16.380300 18.312262 6.634512 4.893574 7.210865 8.717593
## [659] 11.399358 8.411755 12.466745 38.960972 6.082881 10.011945 16.380300
## [666] 19.868914 5.998285 5.705230 12.567033 25.353496 8.902688 7.567230
## [673] 9.369839 7.289091 17.164473 8.870408 6.763125 6.123407 12.441599
## [680] 18.637960 5.603233 14.786531 28.839256 11.556829 10.425369 22.931051
## [687] 27.869344 8.903341 25.882587 11.173832 13.747349 3.696858 18.324510
## [694] 10.089017 9.574540 12.723347 3.969666 10.263652 11.423574 12.723347
## [701] 7.416307 10.754012 11.716112 12.205877 14.523380 8.861752 6.859389
## [708] 4.240356 24.199980 13.748795 5.999869 10.754012 7.741729 16.380300
## [715] 18.384857 9.769501 11.780865 16.229769 3.696858 19.110606 8.587918
## [722] 7.677090 8.661545 10.552663 13.858305 8.498527 8.300828 5.999869
## [729] 11.189485 3.696858 3.083185 4.844452 14.523380 9.018784 27.869344
## [736] 3.416521 7.232464 8.371008 8.903341 7.370914 5.440476 12.441599
## [743] 17.164473 17.885162 9.753318 10.500501 8.587918 16.860813 3.416345
## [750] 17.885162 3.022686 12.145491 6.364545 11.944947 4.837180 11.923134
## [757] 10.742021 20.685614 4.453693 8.902688 14.328320 5.033111 5.999869
## [764] 13.259467 21.414340 11.255072 11.255072 19.593698 16.798833 17.196970
## [771] 12.799737 8.520165 8.838920 13.485158 6.432516 10.500501 7.624908
## [778] 10.739928 9.622106 7.715376 18.145348 10.806579 8.442907 17.571000
## [785] 8.746441 8.675187 19.003499 15.765682 18.787735 10.378807 6.933679
## [792] 6.933679 19.593698 24.199980 10.643449 12.685021 12.799737 13.485158
## [799] 15.464898 10.606454 8.371008 20.441713 10.716731 4.844452 6.784111
## [806] 13.390193 14.328320 10.202440 14.523380 5.504205 14.074218 5.705230
## [813] 15.758809 18.324510 12.685021 13.747349 9.769501 7.677090 13.711653
## [820] 20.865359 9.576794 15.758809 6.763125 6.317307 12.466745 6.171794
## [827] 6.126505 6.859389 10.202440 6.522142 10.500501 17.885162 9.369839
## [834] 19.721716 17.385257 18.350602 5.809265 18.016146 18.384857 11.448772
## [841] 15.885833 12.214204 10.100743 15.292133 9.181265 13.875012 12.214204
## [848] 12.466745 13.259467 7.370914 5.730386 19.754734 7.370914 8.371008
## [855] 13.369841 16.798833 10.552663 10.606454 10.552663 10.528573 4.844452
## [862] 8.587918 15.464898 7.567230 15.292133 8.478883 4.916928 4.893574
## [869] 17.571000 5.084182 6.432516 9.987618 15.158247 11.378978 15.292133
## [876] 6.541213 9.576794 5.423973 9.191463 8.411755 10.683295 5.696189
## [883] 10.851811 8.300828 15.885833 19.540936 9.499031 12.205877 8.498527
## [890] 22.931051 11.780865 20.093110 6.933679 27.869344 10.528573 13.137499
## [897] 22.972734 3.303600 12.924752 8.620682 10.643449 12.924752 15.758809
## [904] 15.266901 3.022686 10.742021 7.196298 3.684812 8.439129 8.730911
## [911] 6.613851 11.374880 10.754012 12.479870 11.423574 8.844229 8.294272
## [918] 3.553975 9.622106 24.560255 9.574540 10.806579 5.322923 6.317307
## [925] 8.902688 10.851811 11.227858 26.925677 6.522142 6.541213 8.294272
## [932] 11.689726 9.132464 24.560255 6.126505 17.571000 10.923991 8.587918
## [939] 13.390193 5.619202 9.071561 6.646520 13.164780 11.944947 21.806891
## [946] 11.708776 8.661545 7.757639 15.885833 10.683295 12.865842 13.137499
## [953] 11.173832 8.520165 5.951457 10.686158 5.033111 6.613851 19.868914
## [960] 13.164780 6.646520 10.011945 19.110606 9.071561 9.018784 20.806033
## [967] 6.213733 24.199980 18.324510 6.064092 24.410827 18.016146 16.220477
## [974] 12.954323 7.543155 9.574540 15.429358 11.661816 8.276915 19.016641
## [981] 11.378978 9.574540 10.594284 10.523994 8.620682 10.686158 29.556340
## [988] 5.696189 9.181265 11.716112 5.084182 16.860813 6.261242 26.631753
## [995] 7.232464 5.322923 14.399568 9.191463 17.385257 19.003499 12.973251
## [1002] 7.196298 11.423574 19.110606 9.132464 11.780865 10.241469 11.708776
## [1009] 5.603233 11.689726 10.528573 5.783882 15.488123 38.960972 9.018784
## [1016] 8.861752 13.369841 10.739928 11.327498 6.082881 9.368690 6.075646
## [1023] 8.804543 6.261242 9.191463
# mahalanobis
library(stats)
heart_MD <- mahalanobis(heart_x, heart_cm, heart_S)
heart_MD
## [1] 7.567230 18.324510 15.885833 9.366377 18.787735 10.011945 38.960972
## [8] 8.675187 8.844229 9.931920 15.991847 21.134232 7.715376 15.765682
## [15] 27.869344 7.715376 9.132464 6.522142 3.684812 10.241469 10.425369
## [22] 10.594284 12.479870 6.171794 6.495000 7.624908 10.202440 11.374880
## [29] 16.229769 29.556340 18.145348 3.684812 10.851811 13.648647 3.416521
## [36] 12.832129 18.384857 8.411755 10.523994 6.859389 12.145491 7.289091
## [43] 8.902688 8.844229 7.967498 7.196298 12.799737 21.806891 8.163048
## [50] 16.860813 13.259467 9.071561 24.410827 13.200681 19.721716 19.721716
## [57] 11.708776 6.364545 10.263652 5.696189 12.567033 8.163048 13.117883
## [64] 13.513219 12.567033 15.464898 16.158613 17.385257 5.334974 34.149741
## [71] 14.786531 9.576794 9.954783 11.934555 14.457960 5.261782 11.923134
## [78] 21.414340 3.303600 3.303600 6.613851 18.350602 12.832129 24.410827
## [85] 5.334974 3.083185 10.241469 16.346673 15.758809 19.593698 6.000657
## [92] 7.796978 21.414340 18.145348 12.685021 8.520165 5.767077 10.739928
## [99] 14.890842 20.806033 5.423973 18.197598 7.796058 5.261782 16.220477
## [106] 8.804543 10.528573 10.378807 11.689726 11.227858 8.276915 13.390193
## [113] 14.457960 11.255072 15.266901 7.624908 6.123407 11.556829 12.567033
## [120] 5.603233 7.796058 18.145348 6.171322 19.868914 9.499031 8.478883
## [127] 9.746636 16.158613 18.245952 11.378978 8.478883 4.837180 12.799737
## [134] 3.416521 7.796058 20.865359 7.967498 20.093110 5.261782 4.893574
## [141] 13.562466 10.716731 10.643449 11.855798 6.082881 14.074218 5.732385
## [148] 11.327498 16.634908 9.987618 38.960972 21.131070 8.903341 7.289091
## [155] 17.164473 7.796058 17.905638 7.289091 47.534084 6.194887 22.692537
## [162] 10.500501 22.692537 8.300828 9.954783 14.786531 13.875012 12.737300
## [169] 10.923991 7.741729 10.083886 18.312262 17.571000 7.766130 6.933679
## [176] 25.882587 18.312262 10.742021 8.276915 14.399568 21.414340 17.978966
## [183] 9.181265 6.126505 5.783882 19.540936 14.074218 9.954783 6.123407
## [190] 11.399358 4.916928 5.999869 47.534084 15.882222 12.865842 13.513219
## [197] 8.870408 7.677090 9.753318 8.730911 7.796978 11.855798 16.634908
## [204] 15.158247 8.163048 13.711653 15.266901 4.916928 24.410827 10.378807
## [211] 26.925677 13.117883 11.568543 10.923991 5.783882 3.253317 8.294272
## [218] 6.194887 11.189485 9.931920 7.210865 6.123407 17.978966 7.766130
## [225] 11.448772 8.746441 18.350602 6.681593 11.661816 21.806891 8.903341
## [232] 5.084182 6.075646 6.634512 5.033111 15.158247 18.350602 5.705230
## [239] 16.860813 8.838920 11.982901 10.263652 24.410827 10.425369 12.954323
## [246] 5.627663 21.131070 11.189485 8.717593 6.126505 8.804543 7.741729
## [253] 9.208650 19.540936 10.754012 13.668936 12.214204 20.093110 24.560255
## [260] 9.746636 3.416345 8.746441 7.631827 7.967498 20.865359 12.479870
## [267] 10.739928 20.392370 16.297574 17.196970 7.543155 5.322923 8.717593
## [274] 11.227858 20.685614 18.312262 10.683295 5.627663 8.675187 5.448627
## [281] 5.467871 5.998285 7.707600 17.978966 16.297574 17.196970 10.715211
## [288] 11.716112 8.433534 11.689726 18.245952 6.213733 13.318870 13.513219
## [295] 25.882587 13.117883 20.392370 12.452186 5.998285 6.064092 4.453693
## [302] 14.523380 7.967498 6.171322 6.646520 8.439129 6.681593 3.083185
## [309] 15.266901 4.240356 9.499031 14.457960 13.648647 13.858305 17.905638
## [316] 13.711653 10.715211 6.171794 11.934555 23.360333 5.951457 6.432516
## [323] 19.016641 25.353496 6.364545 4.240356 21.131070 14.328320 13.736614
## [330] 23.360333 5.334974 15.429358 19.754734 5.504205 6.541213 12.452186
## [337] 18.197598 14.207833 16.229769 6.763125 24.410827 3.416345 7.837793
## [344] 18.016146 6.194887 25.353496 11.568543 6.000657 26.631753 13.137499
## [351] 9.746636 8.730911 11.255072 12.205877 5.705230 8.939781 28.839256
## [358] 12.865842 9.622106 23.360333 7.631827 10.686158 5.423973 14.316413
## [365] 7.210865 14.316413 10.100743 15.488123 13.747349 9.731331 21.134232
## [372] 9.208650 5.504205 5.730386 16.984642 20.685614 9.368690 8.442907
## [379] 20.392370 16.220477 10.806579 16.297574 7.232464 12.452186 12.723347
## [386] 9.731331 6.495000 22.692537 10.742021 20.441713 11.374880 13.748795
## [393] 9.731331 34.149741 17.164473 8.163048 18.637960 14.074218 20.685614
## [400] 20.441713 13.200681 15.991847 9.769501 4.453693 9.576794 16.229769
## [407] 10.241469 8.870408 8.939781 12.832129 5.448627 9.954783 14.328320
## [414] 13.736614 15.464898 7.707600 14.207833 18.245952 7.796978 12.145491
## [421] 5.767077 12.924752 12.737300 11.374880 19.016641 6.780655 7.757639
## [428] 9.933930 26.631753 4.821205 10.263652 13.485158 9.753318 19.754734
## [435] 12.441599 12.737300 16.346673 8.371008 3.553975 5.730386 8.838920
## [442] 9.181265 13.562466 9.933930 3.553975 11.661816 14.316413 8.675187
## [449] 5.732385 4.844452 22.931051 5.619202 21.806891 5.619202 13.485158
## [456] 15.882222 4.821205 8.717593 10.083886 11.448772 11.419118 11.923134
## [463] 16.220477 7.543155 47.534084 24.410827 5.627663 8.804543 13.318870
## [470] 12.865842 8.478883 10.715211 6.784111 7.416307 15.488123 18.197598
## [477] 19.540936 6.859389 9.987618 8.861752 6.780655 22.931051 15.765682
## [484] 5.998285 14.890842 10.378807 11.327498 8.294272 11.982901 13.318870
## [491] 5.767077 9.933930 11.399358 9.208650 12.954323 8.620682 18.637960
## [498] 5.696189 6.317307 16.984642 15.991847 12.214204 16.984642 5.467871
## [505] 4.821205 6.681593 9.499031 7.707600 25.882587 29.556340 11.982901
## [512] 11.556829 7.978755 11.227858 3.083185 13.200681 6.541213 5.448627
## [519] 13.200681 9.366377 11.934555 17.885162 8.661545 9.366377 10.100743
## [526] 4.837180 34.149741 7.796978 22.972734 15.882222 8.498527 7.837793
## [533] 10.716731 5.809265 6.000657 20.806033 3.969666 6.064092 8.844229
## [540] 15.158247 6.213733 5.440476 13.668936 9.622106 14.207833 15.488123
## [547] 13.736614 10.594284 5.732385 9.369839 11.419118 9.931920 21.134232
## [554] 16.158613 8.870408 19.003499 11.934555 7.631827 7.766130 20.392370
## [561] 6.784111 7.837793 6.495000 7.978755 13.875012 5.809265 6.261242
## [568] 7.416307 7.757639 26.925677 11.173832 6.171322 11.855798 7.978755
## [575] 6.075646 10.083886 16.634908 13.736614 13.668936 10.806579 16.798833
## [582] 5.440476 3.022686 11.556829 9.208650 17.905638 11.399358 28.839256
## [589] 7.624908 9.931920 13.858305 15.429358 13.648647 15.429358 12.973251
## [596] 9.366377 9.368690 24.410827 5.730386 10.089017 13.137499 11.189485
## [603] 8.433534 10.089017 3.553975 17.196970 11.423574 17.385257 12.973251
## [610] 29.556340 21.134232 11.780865 20.865359 19.721716 6.780655 3.969666
## [617] 10.606454 6.634512 3.253317 14.890842 13.562466 14.457960 18.787735
## [624] 10.643449 22.972734 11.419118 16.297574 24.560255 13.164780 13.369841
## [631] 13.748795 3.253317 5.467871 12.466745 7.567230 5.951457 16.346673
## [638] 8.433534 13.369841 6.784111 8.939781 14.399568 10.523994 11.944947
## [645] 3.416345 11.568543 4.821205 8.442907 15.991847 5.467871 8.439129
## [652] 6.194887 16.380300 18.312262 6.634512 4.893574 7.210865 8.717593
## [659] 11.399358 8.411755 12.466745 38.960972 6.082881 10.011945 16.380300
## [666] 19.868914 5.998285 5.705230 12.567033 25.353496 8.902688 7.567230
## [673] 9.369839 7.289091 17.164473 8.870408 6.763125 6.123407 12.441599
## [680] 18.637960 5.603233 14.786531 28.839256 11.556829 10.425369 22.931051
## [687] 27.869344 8.903341 25.882587 11.173832 13.747349 3.696858 18.324510
## [694] 10.089017 9.574540 12.723347 3.969666 10.263652 11.423574 12.723347
## [701] 7.416307 10.754012 11.716112 12.205877 14.523380 8.861752 6.859389
## [708] 4.240356 24.199980 13.748795 5.999869 10.754012 7.741729 16.380300
## [715] 18.384857 9.769501 11.780865 16.229769 3.696858 19.110606 8.587918
## [722] 7.677090 8.661545 10.552663 13.858305 8.498527 8.300828 5.999869
## [729] 11.189485 3.696858 3.083185 4.844452 14.523380 9.018784 27.869344
## [736] 3.416521 7.232464 8.371008 8.903341 7.370914 5.440476 12.441599
## [743] 17.164473 17.885162 9.753318 10.500501 8.587918 16.860813 3.416345
## [750] 17.885162 3.022686 12.145491 6.364545 11.944947 4.837180 11.923134
## [757] 10.742021 20.685614 4.453693 8.902688 14.328320 5.033111 5.999869
## [764] 13.259467 21.414340 11.255072 11.255072 19.593698 16.798833 17.196970
## [771] 12.799737 8.520165 8.838920 13.485158 6.432516 10.500501 7.624908
## [778] 10.739928 9.622106 7.715376 18.145348 10.806579 8.442907 17.571000
## [785] 8.746441 8.675187 19.003499 15.765682 18.787735 10.378807 6.933679
## [792] 6.933679 19.593698 24.199980 10.643449 12.685021 12.799737 13.485158
## [799] 15.464898 10.606454 8.371008 20.441713 10.716731 4.844452 6.784111
## [806] 13.390193 14.328320 10.202440 14.523380 5.504205 14.074218 5.705230
## [813] 15.758809 18.324510 12.685021 13.747349 9.769501 7.677090 13.711653
## [820] 20.865359 9.576794 15.758809 6.763125 6.317307 12.466745 6.171794
## [827] 6.126505 6.859389 10.202440 6.522142 10.500501 17.885162 9.369839
## [834] 19.721716 17.385257 18.350602 5.809265 18.016146 18.384857 11.448772
## [841] 15.885833 12.214204 10.100743 15.292133 9.181265 13.875012 12.214204
## [848] 12.466745 13.259467 7.370914 5.730386 19.754734 7.370914 8.371008
## [855] 13.369841 16.798833 10.552663 10.606454 10.552663 10.528573 4.844452
## [862] 8.587918 15.464898 7.567230 15.292133 8.478883 4.916928 4.893574
## [869] 17.571000 5.084182 6.432516 9.987618 15.158247 11.378978 15.292133
## [876] 6.541213 9.576794 5.423973 9.191463 8.411755 10.683295 5.696189
## [883] 10.851811 8.300828 15.885833 19.540936 9.499031 12.205877 8.498527
## [890] 22.931051 11.780865 20.093110 6.933679 27.869344 10.528573 13.137499
## [897] 22.972734 3.303600 12.924752 8.620682 10.643449 12.924752 15.758809
## [904] 15.266901 3.022686 10.742021 7.196298 3.684812 8.439129 8.730911
## [911] 6.613851 11.374880 10.754012 12.479870 11.423574 8.844229 8.294272
## [918] 3.553975 9.622106 24.560255 9.574540 10.806579 5.322923 6.317307
## [925] 8.902688 10.851811 11.227858 26.925677 6.522142 6.541213 8.294272
## [932] 11.689726 9.132464 24.560255 6.126505 17.571000 10.923991 8.587918
## [939] 13.390193 5.619202 9.071561 6.646520 13.164780 11.944947 21.806891
## [946] 11.708776 8.661545 7.757639 15.885833 10.683295 12.865842 13.137499
## [953] 11.173832 8.520165 5.951457 10.686158 5.033111 6.613851 19.868914
## [960] 13.164780 6.646520 10.011945 19.110606 9.071561 9.018784 20.806033
## [967] 6.213733 24.199980 18.324510 6.064092 24.410827 18.016146 16.220477
## [974] 12.954323 7.543155 9.574540 15.429358 11.661816 8.276915 19.016641
## [981] 11.378978 9.574540 10.594284 10.523994 8.620682 10.686158 29.556340
## [988] 5.696189 9.181265 11.716112 5.084182 16.860813 6.261242 26.631753
## [995] 7.232464 5.322923 14.399568 9.191463 17.385257 19.003499 12.973251
## [1002] 7.196298 11.423574 19.110606 9.132464 11.780865 10.241469 11.708776
## [1009] 5.603233 11.689726 10.528573 5.783882 15.488123 38.960972 9.018784
## [1016] 8.861752 13.369841 10.739928 11.327498 6.082881 9.368690 6.075646
## [1023] 8.804543 6.261242 9.191463
heart$pvalues <- pchisq(heart_MD, df=3, lower.tail=FALSE)
heart
## # A tibble: 1,025 × 15
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 1 0 125 212 0 1 168 0 1 2
## 2 53 1 0 140 203 1 0 155 1 3.1 0
## 3 70 1 0 145 174 0 1 125 1 2.6 0
## 4 61 1 0 148 203 0 1 161 0 0 2
## 5 62 0 0 138 294 1 1 106 0 1.9 1
## 6 58 0 0 100 248 0 0 122 0 1 1
## 7 58 1 0 114 318 0 2 140 0 4.4 0
## 8 55 1 0 160 289 0 0 145 1 0.8 1
## 9 46 1 0 120 249 0 0 144 0 0.8 2
## 10 54 1 0 122 286 0 0 116 1 3.2 1
## # … with 1,015 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## # target <dbl>, pvalues <dbl>
# Create a matrix of categorical variables
# heart_cat = subset(heart, select = c(sex,cp, fbs, restecg, exang, slope, ca, thal))
#
# heart_cat_matrix <- as.matrix(heart_cat)
# heart_cat_matrix
#
# dim(heart_cat_matrix)
# length(heart$target)
# The warning message suggests that the chi-squared approximation may be incorrect,
# which can happen when the expected frequency counts are too low. One way to address this
# is to merge categories with low frequencies into a single category.
# For example, let's say that the categories "0" and "1" have low frequencies in the "thal" column.
# We can merge them into a single category "0/1" using the following code:
heart$thal
## [1] 3 3 3 3 2 2 1 3 3 2 2 3 2 3 0 2 2 3 2 2 2 2 2 2 2 3 2 2 1 2 1 2 3 3 2 2 2
## [38] 2 3 3 2 3 2 3 2 2 1 3 2 3 2 3 2 3 3 3 3 2 3 2 2 2 2 2 2 3 2 1 2 3 3 3 2 3
## [75] 3 2 2 3 2 2 3 2 2 2 2 2 2 2 2 3 2 2 3 1 2 2 2 3 1 2 2 3 3 2 1 2 3 3 3 2 2
## [112] 3 3 3 1 3 3 3 2 2 3 1 3 2 3 2 2 2 2 1 2 2 1 2 3 1 2 2 2 2 3 2 2 2 2 3 2 3
## [149] 3 2 1 3 3 3 3 3 3 3 3 2 2 2 2 3 2 3 1 2 2 2 2 3 2 2 3 3 3 2 2 3 3 2 3 2 2
## [186] 3 3 2 3 3 2 3 3 2 2 2 3 2 2 3 2 2 3 3 2 2 1 2 2 3 3 2 3 2 2 2 3 2 3 2 2 3
## [223] 2 2 2 3 2 2 2 3 3 2 3 2 2 3 2 3 3 2 2 3 2 2 2 2 3 3 2 2 2 2 3 3 3 2 2 2 3
## [260] 2 2 3 2 2 1 2 3 2 3 2 3 3 2 2 1 3 3 2 3 2 2 2 2 2 3 2 3 2 2 3 2 3 2 2 3 2
## [297] 2 3 2 2 2 3 2 3 2 3 2 2 1 2 3 3 3 2 3 2 3 2 3 0 2 2 3 1 2 2 3 1 2 0 2 2 2
## [334] 2 3 3 3 2 1 3 2 2 2 3 2 1 3 2 3 3 2 3 3 1 3 2 1 2 2 0 2 3 2 2 2 2 3 3 3 2
## [371] 3 3 2 2 2 1 2 2 2 1 3 3 3 3 3 2 2 2 2 1 2 3 2 3 3 2 3 3 1 1 3 2 2 2 3 1 2
## [408] 3 2 2 2 2 1 2 3 2 2 2 2 2 2 2 2 2 3 3 2 2 3 2 3 3 2 2 2 2 2 3 2 2 2 3 3 2
## [445] 2 2 2 3 2 2 3 2 3 2 3 2 2 2 2 2 3 2 1 3 3 2 2 2 2 2 2 3 2 2 3 3 3 3 2 3 3
## [482] 3 3 2 1 3 3 3 2 2 2 2 3 3 2 3 3 2 2 2 2 2 2 2 2 2 3 2 3 2 2 3 2 2 2 3 3 2
## [519] 3 3 3 3 2 3 3 2 3 2 3 2 3 2 2 2 2 2 2 2 3 3 3 2 2 2 2 3 2 2 2 3 3 2 3 2 3
## [556] 3 3 2 2 2 2 2 2 2 1 2 2 2 2 3 2 3 2 2 3 2 3 2 2 3 3 2 2 3 3 3 3 1 3 2 2 2
## [593] 3 2 1 3 2 2 2 2 3 3 2 2 2 2 2 1 1 2 3 3 1 3 3 2 2 2 2 1 3 3 2 2 3 3 3 3 2
## [630] 2 3 2 2 2 3 2 2 2 2 2 2 3 3 3 2 3 2 2 2 2 3 2 2 3 2 2 2 2 3 2 2 1 2 2 2 2
## [667] 2 3 2 1 2 3 3 3 3 3 3 3 2 3 2 3 1 3 2 3 0 3 3 2 3 2 3 2 3 3 2 3 2 3 2 3 2
## [704] 1 3 3 3 2 2 3 3 3 2 2 2 2 3 1 2 3 2 2 2 2 2 3 3 3 3 2 2 2 3 2 0 2 3 3 3 3
## [741] 2 2 3 3 2 2 2 3 2 3 2 2 2 3 2 2 2 1 2 2 1 2 3 2 3 3 3 3 3 2 1 2 2 3 2 2 3
## [778] 3 2 2 1 3 2 2 3 3 3 3 2 3 3 3 3 2 2 2 1 3 3 2 3 1 2 2 2 3 1 2 3 2 3 3 2 3
## [815] 2 3 2 2 2 1 3 2 3 2 2 2 2 3 2 3 2 3 3 3 1 2 2 3 2 2 3 2 3 2 3 1 2 2 2 3 2
## [852] 2 3 3 2 3 2 2 2 3 2 2 3 3 2 2 2 2 2 2 2 2 3 1 2 3 3 2 3 2 3 2 3 3 3 3 3 1
## [889] 3 3 3 2 3 0 3 3 3 2 2 3 2 2 2 1 2 2 2 2 3 3 3 2 3 2 2 3 3 2 2 3 3 3 3 2 2
## [926] 3 2 3 3 3 3 3 2 3 2 2 2 2 3 2 3 2 2 3 3 3 2 2 3 3 2 3 2 2 2 3 2 3 2 2 2 2
## [963] 3 3 2 2 3 2 3 2 2 3 1 2 3 3 2 2 2 3 1 3 2 3 3 3 2 2 3 2 2 3 2 3 3 3 3 3 1
## [1000] 3 1 2 2 3 2 3 2 3 2 3 3 2 3 1 2 3 2 3 3 2 2 3 2 2 3
heart$thal <- as.character(heart$thal) # convert to character
heart$thal[heart$thal %in% c("0", "1")] <- "0/1" # merge categories
heart$thal <- factor(heart$thal) # convert back to factor
# Convert categorical variables to factors
heart$sex <- as.factor(heart$sex)
heart$cp <- as.factor(heart$cp)
heart$fbs <- as.factor(heart$fbs)
heart$restecg <- as.factor(heart$restecg)
heart$exang <- as.factor(heart$exang)
heart$slope <- as.factor(heart$slope)
heart$ca <- as.factor(heart$ca)
heart$thal <- as.factor(heart$thal)
# Create contingency tables
sex_table <- table(heart$sex, heart$target)
cp_table <- table(heart$cp, heart$target)
fbs_table <- table(heart$fbs, heart$target)
restecg_table <- table(heart$restecg, heart$target)
exang_table <- table(heart$exang, heart$target)
slope_table <- table(heart$slope, heart$target)
ca_table <- table(heart$ca, heart$target)
thal_table <- table(heart$thal, heart$target)
# Perform chi-squared test on each table
sex_chi <- chisq.test(sex_table)
cp_chi <- chisq.test(cp_table)
fbs_chi <- chisq.test(fbs_table)
restecg_chi <- chisq.test(restecg_table)
exang_chi <- chisq.test(exang_table)
slope_chi <- chisq.test(slope_table)
ca_chi <- chisq.test(ca_table)
thal_chi <- chisq.test(thal_table)
# Print p-values for each variable
print(paste0("Sex p-value: ", sex_chi$p.value))
## [1] "Sex p-value: 6.65682068172645e-19"
print(paste0("Chest pain type p-value: ", cp_chi$p.value))
## [1] "Chest pain type p-value: 1.29806646948206e-60"
print(paste0("Fasting blood sugar > 120 mg/dl p-value: ", fbs_chi$p.value))
## [1] "Fasting blood sugar > 120 mg/dl p-value: 0.218624131028943"
print(paste0("Resting electrocardiographic results p-value: ", restecg_chi$p.value))
## [1] "Resting electrocardiographic results p-value: 1.69642510038776e-08"
print(paste0("Exercise induced angina p-value: ", exang_chi$p.value))
## [1] "Exercise induced angina p-value: 2.82663742966344e-44"
print(paste0("ST segment slope p-value: ", slope_chi$p.value))
## [1] "ST segment slope p-value: 1.42108523925458e-34"
print(paste0("Number of major vessels colored by flourosopy p-value: ", ca_chi$p.value))
## [1] "Number of major vessels colored by flourosopy p-value: 1.74701345104618e-54"
print(paste0("Thalassemia p-value: ", thal_chi$p.value))
## [1] "Thalassemia p-value: 1.52159810768481e-61"
# Create contingency tables for each feature
heart_cat <- heart[, c("sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal")]
cont_tables <- lapply(heart_cat, function(x) table(x, heart$target))
cont_tables
## $sex
##
## x 0 1
## 0 86 226
## 1 413 300
##
## $cp
##
## x 0 1
## 0 375 122
## 1 33 134
## 2 65 219
## 3 26 51
##
## $fbs
##
## x 0 1
## 0 417 455
## 1 82 71
##
## $restecg
##
## x 0 1
## 0 283 214
## 1 204 309
## 2 12 3
##
## $exang
##
## x 0 1
## 0 225 455
## 1 274 71
##
## $slope
##
## x 0 1
## 0 46 28
## 1 324 158
## 2 129 340
##
## $ca
##
## x 0 1
## 0 163 415
## 1 160 66
## 2 113 21
## 3 60 9
## 4 3 15
##
## $thal
##
## x 0 1
## 0/1 47 24
## 2 132 412
## 3 320 90
# Perform chi-squared tests for each contingency table
chi_results <- lapply(cont_tables, function(x) chisq.test(x))
chi_results
## $sex
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: x
## X-squared = 78.863, df = 1, p-value < 2.2e-16
##
##
## $cp
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 280.98, df = 3, p-value < 2.2e-16
##
##
## $fbs
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: x
## X-squared = 1.5134, df = 1, p-value = 0.2186
##
##
## $restecg
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 35.784, df = 2, p-value = 1.696e-08
##
##
## $exang
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: x
## X-squared = 194.82, df = 1, p-value < 2.2e-16
##
##
## $slope
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 155.87, df = 2, p-value < 2.2e-16
##
##
## $ca
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 257.29, df = 4, p-value < 2.2e-16
##
##
## $thal
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 280.08, df = 2, p-value < 2.2e-16
# Extract p-values from each test
p_values <- sapply(chi_results, function(x) x$p.value)
p_values
## sex cp fbs restecg exang slope
## 6.656821e-19 1.298066e-60 2.186241e-01 1.696425e-08 2.826637e-44 1.421085e-34
## ca thal
## 1.747013e-54 1.521598e-61
# Choose a significance threshold
sig_threshold <- 0.05
# Select significant features based on p-values
sig_features <- names(p_values[p_values < sig_threshold])
# Print significant features
print(sig_features)
## [1] "sex" "cp" "restecg" "exang" "slope" "ca" "thal"
# So, we got the significant features:
# "sex", "cp", "restecg", "exang", "slope", "ca", "thal"
# chi square plot
plot(qchisq((1:nrow(heart_x) - 1/2) / nrow(heart_x), df = 3), sort(d),
xlab = expression(paste(chi[3]^2, " Quantile")),
ylab = "Ordered distances")
abline(a = 0, b = 1)

# From the chi square plot, we see that the distances appear to be roughly linearly related
# to the quantiles of the chi-squared distribution, and the points fall close to the diagonal line.
# This suggests that the Mahalanobis distance metric may be appropriate for this dataset.
# However, further analysis and interpretation may be needed to draw more definitive conclusions.
# Correlations
pairs(heart_x)

# Now, let us see the distribution of Heart Disease
# First, we will be looking at how many data points do we have where the client has or does not
# have heart disease. We would like to make sure that the dataset is has a balance of
# people with and without heart disease in order to create an accurate model.
heart %>%
count(target)
## # A tibble: 2 × 2
## target n
## <dbl> <int>
## 1 0 499
## 2 1 526
# As we can see from the result the dataset is also balanced as we have 526 datapoints
# which dont have heart disease and 499 data points with heart disease.
# While the dataset might seem balanced on the surface when we look further we will see that
# the dataset is not balanced based on gender
# 1) Hypothesis 1
# Sex: Studies have shown that men are more likely to develop heart disease than women.
# In the Heart Disease dataset, we can use a contingency table and chi-square test to
# investigate the association between sex and the presence of heart disease.
# Which population is diagnosed more with a heart disease? Male Population or Female population?
# The signs of a woman having a heart attack are much less noticeable than the signs of a male.
# In women, heart attacks may feel uncomfortable squeezing, pressure, fullness, or pain in the
# center of the chest. It may also cause pain in one or both arms, the back, neck, jaw or stomach,
# shortness of breath, nausea and other symptoms. Men experience typical symptoms of heart attack,
# such as chest pain , discomfort, and stress. They may also experience pain in other areas,
# such as arms, neck , back, and jaw, and shortness of breath, sweating, and discomfort that
# mimics heartburn.
# source: healthblog.uofmhealth
# The proportion of females and males in the dataset.
heart %>%
group_by(sex) %>%
summarise(percent = 100 * n() / nrow(heart))
## # A tibble: 2 × 2
## sex percent
## <fct> <dbl>
## 1 0 30.4
## 2 1 69.6
# sex percent
# <fct> <dbl>
# 0 30.4
# 1 69.6
# There are 30.4 % females and 69.6% males in the dataset
# Distribution of Heart Disease among males and females
# Create a contingency table of sex and target (heart disease)
Sub_female <- table(heart[heart$sex==0,]$target)
Sub_male <- table(heart[heart$sex==1,]$target)
FM_combine <- rbind(Sub_female,Sub_male)
#Rename columns names and rows names.
colnames(FM_combine) <- c("Has heart disease", "Does not have heart disease")
rownames(FM_combine) <- c("Females", "Males")
#Display the table
FM_combine
## Has heart disease Does not have heart disease
## Females 86 226
## Males 413 300
# There are 86 females out of 312 who have diagnosed with heart disease and 413 males out of 713
# were diagnosed with heart disease.
# This indicates that 58% of males in this dataset are diagnosed with heart disease
# where is only 27% of females are diagnosed with heart disease.
# Perform a chi-square test on the contingency table
chi_test <- chisq.test(FM_combine)
chi_test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: FM_combine
## X-squared = 78.863, df = 1, p-value < 2.2e-16
# The output from performing Pearson's chi-squared test with Yates' continuity correction
# on a contingency table:
# Pearson's Chi-squared test with Yates' continuity correction
# data: cont_table
# X-squared = 78.863, df = 1, p-value < 2.2e-16
# The chi-squared test tests the null hypothesis that there is no association between the two
# variables, and the alternative hypothesis that there is a significant association.
# The test statistic X-squared is 78.863 with 1 degree of freedom (df) and a p-value less than
# 2.2e-16, which is smaller than any significance level commonly used (0.05).
# This indicates strong evidence against the null hypothesis and suggests that there is a
# significant association between the "sex" and "target" variables in the Heart Disease dataset.
# Specifically, the test suggests that men are more likely to have heart disease than women
# since the contingency table shows that a higher proportion of men have heart disease compared
# to women, and so we can infer that men are more likely to have heart disease than women.
# This is a mosaic plot, helps to visualize the statistical association between two variables.
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
##
## The following object is masked from 'package:Matrix':
##
## mean
##
## The following objects are masked from 'package:pROC':
##
## cov, var
##
## The following objects are masked from 'package:infer':
##
## prop_test, t_test
##
## The following object is masked from 'package:scales':
##
## rescale
##
## The following object is masked from 'package:purrr':
##
## cross
##
## The following object is masked from 'package:ggplot2':
##
## stat
##
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
##
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
##
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
mosaicplot(heart$sex ~ heart$target,
main="Heart disease outcome by Gender", shade=FALSE,color=TRUE,
xlab="Gender", ylab="Heart disease")

# Ans: We can conclude that males are diagnosed more with a heart disease than females
# 2) Hypothesis
# Age: As age increases, the likelihood of developing heart disease also increases.
# This is supported by previous studies on cardiovascular disease. In the Heart Disease dataset,
# we can use a scatter plot to visualize the relationship between age and the presence of heart
# disease, and a correlation test to measure the strength of the relationship.
ggplot(heart, aes(x = age, y = factor(target), color = target)) +
geom_point() +
labs(x = "Age", y = "Presence of Heart Disease", color = "Heart Disease")

# heart$target <- as.numeric(heart$target)
cor.test(heart$age, heart$target, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: x and y
## t = -7.5356, df = 1023, p-value = 1.068e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2865322 -0.1704854
## sample estimates:
## cor
## -0.2293236
# true correlation is not equal to 0 means that there is a significant linear relationship
# between the two variables being tested.
# Pearson's product-moment correlation
#
# data: heart$age and heart$target
# t = -7.5356, df = 1023, p-value = 1.068e-13
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# -0.2865322 -0.1704854
# sample estimates:
# cor
# -0.2293236
# If the p-value is less than the chosen significance level (0.05), we can conclude
# that there is a significant correlation between age and the presence of heart disease.
# The test statistic is the t-value, which in this case is -7.5356.
# The degrees of freedom (df) are 1023, and the p-value is 1.068e-13, which is very small
# and indicates that there is a significant correlation between age and target.
# "alternative hypothesis: true correlation is not equal to 0":
# This line specifies the alternative hypothesis, which is that there is a non-zero
# correlation between age and target.
# "95 percent confidence interval: -0.2865322 -0.1704854":
# This line provides the 95% confidence interval for the correlation coefficient.
# In this case, it ranges from -0.2865322 to -0.1704854.
# "sample estimates: cor -0.2293236":
# This line provides the sample estimate of the correlation coefficient, which is -0.2293236.
# This indicates a moderate negative correlation between age and target.
# Based on the output of the Pearson's correlation test, we can conclude that there is a
# statistically significant negative correlation between age and the presence of heart disease
# (target variable) in the Heart Disease dataset.
# The correlation coefficient is -0.2293, which indicates a moderate negative correlation
# between the two variables. As age increases, the likelihood of having heart disease decreases.
# However, it's important to note that correlation does not necessarily imply causation,
# and further analysis would be needed to establish causality.
# Let us create a A faceted histogram which shows the relationship between heart disease and age
age.plot <- ggplot(heart, mapping = aes(x = age, fill = target)) +
geom_histogram() +
facet_wrap(vars(target)) +
labs(title = "Prevelance of Heart Disease Across Age", x = "Age (years)", y = "Count", fill = "Heart Disease")
age.plot
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# The histograms for age faceted on the presence and absence of heart disease have
# different distribution shapes, suggesting that age does have a relationship with heart disease.
# The distribution of the presence of heart disease is left skewed while the distribution
# of the absence of heart disease appears more normally distributed.
# These graphics suggests that there are more older people with heart disease than younger
# people with heart disease.
# This is a boxplot to displays the age distribution of heart diagnosis.
boxplot(heart$age ~ heart$target,
main="Heart disease diagnosis distribution by Age",
ylab="Age",xlab="Heart disease diagnosed")

# Ans: We can conclude that people with a higher age are more likely diagnised with a heart disease
# 3) Hypothesis 3
# Chest pain type: The type of chest pain a patient experiences can provide important
# diagnostic information about heart disease.
# cp: chest pain type
# — Value 0: asymptomatic
# — Value 1: atypical angina
# — Value 2: non-anginal pain
# — Value 3: typical angina
#The proportion of patients with chest pain types.
heart %>% group_by(cp) %>%
summarise( percent = 100 * n() / nrow( heart ))
## # A tibble: 4 × 2
## cp percent
## <fct> <dbl>
## 1 0 48.5
## 2 1 16.3
## 3 2 27.7
## 4 3 7.51
# cp percent
# <fct> <dbl>
# 0 48.5
# 1 16.3
# 2 27.7
# 3 7.51
# There are 7% of patient with typical angina chest pain, 16 % of patients with atypical angina,
# 27% with non-angina pain, and 48% with asymptomatic chest pain.
# In the Heart Disease dataset, we can use a box plot to visualize the distribution of chest
# pain type among patients with and without heart disease, and a t-test to compare the
# means of the two groups.
# create a box plot to visualize the distribution of chest pain type
ggplot(heart, aes(x = target, y = factor(cp))) +
geom_boxplot() +
xlab("Presence of Heart Disease") +
ylab("Chest Pain Type")

# The boxplot suggests that chest pain type may be a significant predictor of heart disease in
# patients.
# This plot to visualize the Heart disease diagnosis Distributions by Chest pain.
# There are four types of chest pain:(0) asymptomatic, (1) atypical angina, (2) non-anginal pain,
# and (3) typical angina.
ggplot(data = heart, aes(x = target, fill = cp)) +
geom_bar(position = "fill") +
labs(title = "Heart disease diagnosis Distributions by Chest pain",
x = "Heart disease diagnosis",
y = "chest pain") +
theme_test()

str(heart)
## spc_tbl_ [1,025 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:1025] 52 53 70 61 62 58 58 55 46 54 ...
## $ sex : Factor w/ 2 levels "0","1": 2 2 2 2 1 1 2 2 2 2 ...
## $ cp : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ trestbps: num [1:1025] 125 140 145 148 138 100 114 160 120 122 ...
## $ chol : num [1:1025] 212 203 174 203 294 248 318 289 249 286 ...
## $ fbs : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 1 1 1 ...
## $ restecg : Factor w/ 3 levels "0","1","2": 2 1 2 2 2 1 3 1 1 1 ...
## $ thalach : num [1:1025] 168 155 125 161 106 122 140 145 144 116 ...
## $ exang : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 2 1 2 ...
## $ oldpeak : num [1:1025] 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
## $ slope : Factor w/ 3 levels "0","1","2": 3 1 1 3 2 2 1 2 3 2 ...
## $ ca : Factor w/ 5 levels "0","1","2","3",..: 3 1 1 2 4 1 4 2 1 3 ...
## $ thal : Factor w/ 3 levels "0/1","2","3": 3 3 3 3 2 2 1 3 3 2 ...
## $ target : num [1:1025] 0 0 0 0 0 1 0 0 0 0 ...
## $ pvalues : num [1:1025] 0.055856 0.000377 0.001197 0.024796 0.000302 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_double(),
## .. cp = col_double(),
## .. trestbps = col_double(),
## .. chol = col_double(),
## .. fbs = col_double(),
## .. restecg = col_double(),
## .. thalach = col_double(),
## .. exang = col_double(),
## .. oldpeak = col_double(),
## .. slope = col_double(),
## .. ca = col_double(),
## .. thal = col_double(),
## .. target = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
heart$target <- as.numeric(heart$target)
heart$cp <- as.numeric(heart$cp)
# perform a t-test to compare the means of chest pain type between patients with and without heart disease
t.test(cp ~ target, data = heart)
##
## Welch Two Sample t-test
##
## data: cp by target
## t = -15.462, df = 1022.9, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.0089917 -0.7817304
## sample estimates:
## mean in group 0 mean in group 1
## 1.482966 2.378327
# Welch Two Sample t-test
#
# data: cp by target
# t = -15.462, df = 1022.9, p-value < 2.2e-16
# alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
# 95 percent confidence interval:
# -1.0089917 -0.7817304
# sample estimates:
# mean in group 1 mean in group 2
# 1.482966 2.378327
# This is the output of a Welch two-sample t-test which is used to test if there is a significant
# difference between the means of two groups. In this case, the groups are defined by the presence
# or absence of heart disease (target variable) and the chest pain type (cp variable).
# The output shows the t-value, degrees of freedom (df), and the p-value.
# The null hypothesis is that the means of the two groups are equal,
# and the alternative hypothesis is that they are not equal.
# A p-value less than the significance level (typically 0.05) indicates that there is
# evidence to reject the null hypothesis.
# In this case, the p-value is extremely small (less than 2.2e-16),
# so we can conclude that there is a significant difference between the means of the two groups.
# The 95% confidence interval shows the range within which we can be 95% confident that the
# true difference between the means of the two groups lies.
# The sample estimates show the mean value of chest pain type (cp variable) for the two groups,
# with group 0 indicating those without heart disease and group 1 indicating those with
# heart disease.
# A a bar chart is shown for the relationship between heart disease and chest pain type.
cp.plot <- ggplot(heart, mapping = aes(x=target, fill = factor(cp))) +
geom_bar(position = "dodge") +
labs(title = "Prevelance of Heart Disease for Different Chest Pain Types", x = "Heart Disease", y = "Count", fill = "Chest Pain Type")
cp.plot

# There does appear to be a relationship between type of chest pain and heart disease.
# Interestingly, asymptomatic chest pain type (Value 0) has the highest count for the presence of
# heart disease, while typical angina pain has the lowest count. There is a higher count
# of people without heart disease that have atypical or typical angina chest pain compared
# to people with heart disease. Angina is listed as one of the most common symptoms of
# heart attack and so this result is skeptical and needs further investigation, but we will
# assume it is correct for the current analysis.
# Ans: Therefore, chest pain type is a significant predictor of heart disease in patients.
attach(heart)
# Visualizations of each variable are created in order to familiarize ourselves with
# the predictors.
# Proportional bar charts are created for our 3 binary variables: sex, fasting blood sugar,
# and exercise induced angina. Box plots are shown for our 3 numerical variables:
# resting blood pressure, serum cholesterol, and maximum heart rate.
heart$target <- as.factor(heart$target)
# sex: The person’s sex (1 = male, 0 = female)
sex.plot <- ggplot(heart, mapping = aes(x = sex, fill = target)) +
geom_bar(position = "fill") +
labs(x = "Sex", y = "Proportion", fill = "Heart Disease") +
theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))
# fbs: The person’s fasting blood sugar (> 120 mg/dl, 1 = true; 0 = false)
fbs.plot <- ggplot(heart, mapping = aes(x=fbs, fill=target)) +
geom_bar(position = "fill") +
labs(x = "Fasting Blood Sugar", y = "Proportion", fill = "Heart Disease") +
scale_x_discrete(labels = c("low", "high"))+
theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))
# exang: Exercise induced angina (1 = yes; 0 = no)
exang.plot <- ggplot(heart, mapping = aes(x = exang, fill = target)) +
geom_bar(position = "fill") +
labs(x = "Exercise induced angina", y = "Proportion", fill = "Heart Disease") +
theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12))
grid.arrange(sex.plot, fbs.plot, exang.plot, nrow=2)

# The bar plot on the top left shows a higher proportion of males with heart disease than
# females with heart disease, suggesting that there is a relationship between sex and heart disease.
# There is an even larger distinction for heart disease as it relates to exercise induced angina,
# for a much higher proportion of people with exercise induced angina have heart disease compared
# to people without exercise induced angina (bottom left plot).
# Fasting blood sugar levels do not appear to have a correlation with heart disease, as there
# appears to be a similar proportion of presence and absence of heart disease for people with
# high and low fasting blood sugar levels (top right plot).
# More visualizations
# trestbps: The person’s resting blood pressure (mm Hg on admission to the hospital)
trestbps.plot <- ggplot(heart, mapping = aes(x=trestbps, y=target)) +
geom_boxplot() +
labs(x = "Resting Blood Pressure (mm Hg)", y = "Heart Disease") +
theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))
# chol: The person’s cholesterol measurement in mg/dl
chol.plot <- ggplot(heart, mapping = aes(x=chol, y=target)) +
geom_boxplot() +
labs(x = "Serum Cholestoral (mg/dl)", y = "Heart Disease") +
theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))
# thalach: The person’s maximum heart rate achieved
maxhr.plot <- ggplot(heart, mapping = aes(x = thalach, y = target)) +
geom_boxplot() +
labs(x = "Maximum Heart Rate (bpm)", y = "Heart Disease") +
theme(axis.text.x = element_text(size = 12), axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12), axis.text.y = element_text(size = 12))
grid.arrange(trestbps.plot, chol.plot, maxhr.plot, nrow=2)

# The box plots for resting blood pressure (top left) appear similar for the presence and
# absence of heart disease; the boxes (middle 50% of data) overlap and have similar centers
# and boundaries. These similar distributions suggest that there is likely not a strong
# relationship between resting blood pressure and heart disease.
# No strong relationship between serum cholesterol and heart disease is visible either,
# for these distributions are close in center and spread (top right).
# However, the last box plot does show a relationship between maximum heart rate achieved
# during exercise and heart disease. The trend of maximum heart rate appears to be higher for
# the absence of heart disease compared to the presence of heart disease, and there is
# little overlap between the two boxes.
# And, to visualize the relationship between restecg and target, we can create a stacked bar plot.
# This plot shows the count of each value of restecg for each value of target.
# restecg: resting electrocardiographic results
# Value 0: showing probable or definite left ventricular hypertrophy by Estes’ criteria
# Value 1: normal
# Value 2: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
ggplot(heart, aes(x = target, fill = factor(restecg))) +
geom_bar()

# From the above plot, we can infer that restecg value 0 and value 2 is found more in common with
# people having a heart disease.
# So, restecg is also a important feature.
# From the above observations, we got the significant features which is correct:
# "sex", "cp", "restecg", "exang", "slope", "ca", "thal"
# More useful visualizations
# Another plot to visualize heart disease diagnosis Distributions by Number of major vessels.
# ca: The number of major vessels (0–3)
ggplot(data = heart, aes(x = target, fill = ca)) +
geom_bar(position = "fill") +
labs(title = "Heart disease diagnosis Distributions by Number of major vessels ",
x = "Heart disease diagnosis",
y = "thal") +
theme_test()

# Here is the scatterplot showing the relationship between age and maximum heart rate achieved
# in the heart dataset:
ggplot(heart, aes(x=age, y=thalach, color=target)) +
geom_point(alpha=0.5) +
labs(x="Age", y="Max heart rate", color="Target") +
theme_minimal()

# The downward trend indicates that as age increases, maximum heart rate achieved tends to decrease.
# This relationship appears to be somewhat linear, although there is some variability in the data.
# Histogram of patient’s age and gender
meanAge <- data.frame(sex = c(0, 1), age = c(mean(heart[heart$sex==0,]$age),mean(heart[heart$sex==1,]$age)))
#ggplot of age of the patients categorized by sex
Plot <- ggplot(heart, aes(x=age, fill=as.factor(sex))) +
geom_histogram(alpha=0.5, position="identity")+
geom_vline(aes(xintercept = age), meanAge)+
facet_wrap(~as.factor(sex))+
labs(title="Histogram of patients's age by gender",
x="Age of patients", y="Count", fill="Sex")+
geom_text(meanAge, mapping=aes(x=age, y=8.5, label=paste("Mean=", signif(age,4))),
size=4, angle=90, vjust=-0.4, hjust=0)+
scale_fill_discrete(breaks=c("0", "1"),
labels=c("0 - Female", "1 - Male"))
#display the Plot
Plot
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Heart disease diagnosis frequency by Resting electrocardiographic results and sex
heart$restecg <- as.factor(heart$restecg)
heart %>%
ggplot(aes(x = target, fill=restecg)) +
geom_bar(position = "dodge") +
facet_grid(~sex) +
scale_fill_brewer(palette = "Dark2") +
labs(title="Heart disease diagnosis frequency by restecg and sex")

heart$age <- as.numeric(heart$age)
heart$sex <- as.numeric(heart$sex)
heart$cp <- as.numeric(heart$cp)
heart$trestbps <- as.numeric(heart$trestbps)
heart$chol <- as.numeric(heart$chol)
heart$fbs <- as.numeric(heart$fbs)
heart$restecg <- as.numeric(heart$restecg)
heart$thalach <- as.numeric(heart$thalach)
heart$exang <- as.numeric(heart$exang)
heart$oldpeak <- as.numeric(heart$oldpeak)
heart$slope <- as.numeric(heart$slope)
heart$ca <- as.numeric(heart$ca)
heart$thal <- as.numeric(heart$thal)
correlations <- cor(heart[,1:13])
corrplot(correlations, method="circle")

# A dot-representation was used where blue represents positive correlation and red negative.
# The larger the dot the larger the correlation.
# -> As age increases, cholesterol, stress of heart during exercise and resting bp also increase.
# On the other hand, maximum heart rate falls with old age.
# -> As cholesterol increases, stress of heart during exercise and resting bp increase,
# while maximum heart rate falls.
# -> As ST depression rises, i.e. stress of the heart falls, resting bp rises.
# -> Resting bp also has a negative relation with maximum heart rate.
# -> The degree of correlation is very small between all variables. However, age and maximum
# heart rate show a slightly higher correlation.
# -> St depression and maximum heart rate also show similar results.
# Feature Importance
# There are different ways to identify the important features in the data.
#
# 1- Correlation
#
# 2- Random Forest: Gini Importance or Mean Decrease in Impurity (MDI) calculates each feature
# importance as the sum over the number of splits (across all tress) that include the feature,
# proportionally to the number of samples it splits.
corelations = data.frame(cor(heart[,1:13], use = "complete.obs"))
corelations
## age sex cp trestbps chol
## age 1.00000000 -0.10324030 -0.07196627 0.27112141 0.21982253
## sex -0.10324030 1.00000000 -0.04111909 -0.07897377 -0.19825787
## cp -0.07196627 -0.04111909 1.00000000 0.03817742 -0.08164102
## trestbps 0.27112141 -0.07897377 0.03817742 1.00000000 0.12797743
## chol 0.21982253 -0.19825787 -0.08164102 0.12797743 1.00000000
## fbs 0.12124348 0.02720046 0.07929359 0.18176662 0.02691716
## restecg -0.13269617 -0.05511721 0.04358061 -0.12379409 -0.14741024
## thalach -0.39022708 -0.04936524 0.30683928 -0.03926407 -0.02177209
## exang 0.08816338 0.13915681 -0.40151271 0.06119697 0.06738223
## oldpeak 0.20813668 0.08468656 -0.17473348 0.18743411 0.06488031
## slope -0.16910511 -0.02666629 0.13163278 -0.12044531 -0.01424787
## ca 0.27155053 0.11172891 -0.17620647 0.10455372 0.07425934
## thal 0.07224456 0.20211721 -0.16985403 0.05894881 0.09552541
## fbs restecg thalach exang oldpeak
## age 0.121243479 -0.13269617 -0.390227075 0.08816338 0.20813668
## sex 0.027200461 -0.05511721 -0.049365243 0.13915681 0.08468656
## cp 0.079293586 0.04358061 0.306839282 -0.40151271 -0.17473348
## trestbps 0.181766624 -0.12379409 -0.039264069 0.06119697 0.18743411
## chol 0.026917164 -0.14741024 -0.021772091 0.06738223 0.06488031
## fbs 1.000000000 -0.10405124 -0.008865857 0.04926057 0.01085948
## restecg -0.104051244 1.00000000 0.048410637 -0.06560553 -0.05011425
## thalach -0.008865857 0.04841064 1.000000000 -0.38028087 -0.34979616
## exang 0.049260570 -0.06560553 -0.380280872 1.00000000 0.31084376
## oldpeak 0.010859481 -0.05011425 -0.349796163 0.31084376 1.00000000
## slope -0.061902374 0.08608609 0.395307843 -0.26733547 -0.57518854
## ca 0.137156259 -0.07807235 -0.207888416 0.10784854 0.22181603
## thal -0.030129129 -0.02030400 -0.106701873 0.20958207 0.20473483
## slope ca thal
## age -0.16910511 0.27155053 0.07224456
## sex -0.02666629 0.11172891 0.20211721
## cp 0.13163278 -0.17620647 -0.16985403
## trestbps -0.12044531 0.10455372 0.05894881
## chol -0.01424787 0.07425934 0.09552541
## fbs -0.06190237 0.13715626 -0.03012913
## restecg 0.08608609 -0.07807235 -0.02030400
## thalach 0.39530784 -0.20788842 -0.10670187
## exang -0.26733547 0.10784854 0.20958207
## oldpeak -0.57518854 0.22181603 0.20473483
## slope 1.00000000 -0.07344041 -0.09650143
## ca -0.07344041 1.00000000 0.14576170
## thal -0.09650143 0.14576170 1.00000000
# Splitting the data for females and males in order to find the most important factor
# leading to heart disease in each gender.
#Create a subset for males only.
Male_Data <- subset(heart, sex==1)
Male_Data
## # A tibble: 312 × 15
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 62 1 1 138 294 2 2 106 1 1.9 2
## 2 58 1 1 100 248 1 1 122 1 1 2
## 3 71 1 1 112 149 1 2 125 1 1.6 2
## 4 43 1 1 132 341 2 1 136 2 3 2
## 5 34 1 2 118 210 1 2 192 1 0.7 3
## 6 34 1 2 118 210 1 2 192 1 0.7 3
## 7 51 1 3 140 308 1 1 142 1 1.5 3
## 8 50 1 2 120 244 1 2 162 1 1.1 3
## 9 67 1 1 106 223 1 2 142 1 0.3 3
## 10 63 1 3 135 252 1 1 172 1 0 3
## # … with 302 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## # target <fct>, pvalues <dbl>
#Create another subset for females only.
Female_Data <- subset(heart, sex != 1)
Female_Data
## # A tibble: 713 × 15
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 2 1 125 212 1 2 168 1 1 3
## 2 53 2 1 140 203 2 1 155 2 3.1 1
## 3 70 2 1 145 174 1 2 125 2 2.6 1
## 4 61 2 1 148 203 1 2 161 1 0 3
## 5 58 2 1 114 318 1 3 140 1 4.4 1
## 6 55 2 1 160 289 1 1 145 2 0.8 2
## 7 46 2 1 120 249 1 1 144 1 0.8 3
## 8 54 2 1 122 286 1 1 116 2 3.2 2
## 9 51 2 1 140 298 1 2 122 2 4.2 2
## 10 52 2 1 128 204 2 2 156 2 1 2
## # … with 703 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## # target <fct>, pvalues <dbl>
# converting the target column into a factor of 2 groups
Male_Data$target <- as.factor(Male_Data$target)
Female_Data$target <- as.factor(Female_Data$target)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:gridExtra':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:mosaic':
##
## dotPlot
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:purrr':
##
## lift
#Feature selection using random forest technique
Feature_Importance_Males = randomForest(target~., data=Male_Data)
# Create an importance based on mean decreasing gini
importance(Feature_Importance_Males)
## MeanDecreaseGini
## age 11.924009
## sex 0.000000
## cp 12.288118
## trestbps 7.936674
## chol 6.631398
## fbs 1.247778
## restecg 2.722251
## thalach 8.177346
## exang 10.096695
## oldpeak 16.625298
## slope 7.774846
## ca 10.629276
## thal 17.299322
## pvalues 9.647001
varImp(Feature_Importance_Males)
## Overall
## age 11.924009
## sex 0.000000
## cp 12.288118
## trestbps 7.936674
## chol 6.631398
## fbs 1.247778
## restecg 2.722251
## thalach 8.177346
## exang 10.096695
## oldpeak 16.625298
## slope 7.774846
## ca 10.629276
## thal 17.299322
## pvalues 9.647001
varImpPlot(Feature_Importance_Males, col= "red", pch= 20)

Feature_Importance_Females = randomForest(target~., data=Female_Data)
# Create an importance based on mean decreasing gini
importance(Feature_Importance_Females)
## MeanDecreaseGini
## age 32.344729
## sex 0.000000
## cp 45.878216
## trestbps 24.185781
## chol 28.093235
## fbs 3.764119
## restecg 5.973994
## thalach 46.295532
## exang 10.672837
## oldpeak 37.299017
## slope 15.019152
## ca 45.620489
## thal 20.668602
## pvalues 27.953655
varImp(Feature_Importance_Females)
## Overall
## age 32.344729
## sex 0.000000
## cp 45.878216
## trestbps 24.185781
## chol 28.093235
## fbs 3.764119
## restecg 5.973994
## thalach 46.295532
## exang 10.672837
## oldpeak 37.299017
## slope 15.019152
## ca 45.620489
## thal 20.668602
## pvalues 27.953655
varImpPlot(Feature_Importance_Females, col= "red", pch= 20)

# 4) Hypothesis
# H0 = There is no association between chest pain and heart disease diagnosis
# HA = There is association between chest pain and heart disease diagnosis
qqnorm(heart$age)
qqline(heart$age)

# Scatterplot
library(ggplot2)
gg <- ggplot(heart, aes(x=chol, y=trestbps)) +
geom_point(aes(col=target, size=oldpeak)) +
geom_smooth(method="loess", se=F) +
xlim(c(100, 430)) +
ylim(c(75, 200)) +
labs(subtitle="trestbps Vs chol",
y="trestbps",
x="chol",
title="Scatterplot",
caption = "Source: midwest",
bins = 30)
plot(gg)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# We will try logistic regression to predict which patients have heart disease.
# Logistic Regression Prediction
library(caTools)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(ROCR)
library(Information)
##
## Attaching package: 'Information'
##
## The following object is masked from 'package:dials':
##
## penalty
library(OptimalCutpoints)
library(pROC)
library(caret)
library(ggplot2)
library(lattice)
#Split the Data to training and testing data to conduct a logistic regression model
set.seed(123)
split=sample.split(heart$target, SplitRatio = 0.75)
Train_Data=subset(heart,split == TRUE)
Test_Data=subset(heart,split == FALSE)
# Fit a logistic regression model: We will fit a logistic regression model using the "glm"
# function in R.
Log_model <- glm(target ~., data=Train_Data, family = "binomial")
summary(Log_model)
##
## Call:
## glm(formula = target ~ ., family = "binomial", data = Train_Data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4807 -0.3794 0.1204 0.6027 2.7168
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.151504 1.921100 3.202 0.001364 **
## age -0.007024 0.014762 -0.476 0.634226
## sex -1.770169 0.293189 -6.038 1.56e-09 ***
## cp 0.837972 0.115954 7.227 4.95e-13 ***
## trestbps -0.020891 0.006786 -3.079 0.002078 **
## chol -0.005026 0.002361 -2.129 0.033240 *
## fbs -0.432964 0.340185 -1.273 0.203113
## restecg 0.451292 0.218007 2.070 0.038444 *
## thalach 0.026793 0.006557 4.086 4.39e-05 ***
## exang -1.025174 0.269484 -3.804 0.000142 ***
## oldpeak -0.599619 0.137488 -4.361 1.29e-05 ***
## slope 0.398782 0.226414 1.761 0.078189 .
## ca -0.758476 0.122161 -6.209 5.34e-10 ***
## thal -1.015182 0.186298 -5.449 5.06e-08 ***
## pvalues -1.144755 1.780704 -0.643 0.520311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1064.15 on 767 degrees of freedom
## Residual deviance: 546.33 on 753 degrees of freedom
## AIC: 576.33
##
## Number of Fisher Scoring iterations: 6
# Model evaluation: We will evaluate the performance of the model using various metrics such as
# accuracy, sensitivity, specificity, and ROC curve.
# Make predictions on test data
predictions <- predict(Log_model, newdata = Test_Data, type = "response")
# Measure the accuracy of prediction in the test data
# The common practice is to take the probability cutoff as 0.5.
# If the probability of Y is > 0.5, then it can be classified an event (presence of heart disease).
# So if pred is greater than 0.5, it is positive(heart disease =yes) else it is negative
y_pred_num <- ifelse(predictions > 0.5, 1, 0)
y_pred <- factor(y_pred_num, levels=c(0, 1))
y_act <- Test_Data$target
# Result : Prediction Accuracy (Proportion of predicted target that matches with actual target)
mean(y_pred == y_act)
## [1] 0.8677043
# 0.8754864
# Calculate the optimal threshold
pred <- prediction(predictions, Test_Data$target)
perf <- performance(pred, "tpr", "fpr")
thresholds <- perf@alpha.values[[1]]
# Plot the ROC curve
plot(perf)
# Add the optimal threshold point to the ROC curve
abline(a = 0, b = 1)
points(thresholds, thresholds, col = "red", pch = 19)

# Get p-values for all features
summary(Log_model)$coef[, "Pr(>|z|)"]
## (Intercept) age sex cp trestbps chol
## 1.364422e-03 6.342264e-01 1.563796e-09 4.946596e-13 2.078266e-03 3.324016e-02
## fbs restecg thalach exang oldpeak slope
## 2.031131e-01 3.844444e-02 4.388141e-05 1.422598e-04 1.293311e-05 7.818903e-02
## ca thal pvalues
## 5.338421e-10 5.058450e-08 5.203109e-01
# Suppose you have a vector of p-values in scientific notation
p_values <- c(601880e-03, 6.525117e-01, 1.709865e-09, 3.400166e-13, 2.303392e-03, 3.193722e-02,
2.370040e-01, 4.173028e-02, 4.954128e-05, 1.587542e-04, 1.448314e-05, 8.981884e-02,
4.013419e-10, 5.346086e-08 )
# Use format() to convert to normal form
formatted_p_values <- format(p_values, scientific = FALSE)
# View the result
formatted_p_values
## [1] "601.8799999999999954525" " 0.6525117000000000278"
## [3] " 0.0000000017098650000" " 0.0000000000003400166"
## [5] " 0.0023033920000000000" " 0.0319372200000000023"
## [7] " 0.2370039999999999925" " 0.0417302800000000015"
## [9] " 0.0000495412800000000" " 0.0001587542000000000"
## [11] " 0.0000144831400000000" " 0.0898188399999999970"
## [13] " 0.0000000004013419000" " 0.0000000534608600000"
# The p-value obtained for the coefficient of the chest pain variable (cp) in the logistic
# regression model indicates the statistical significance of the association between
# chest pain and heart disease diagnosis.
# The p-value for cp is given as "3.40e-13" which is very small (less than 0.05),
# indicating strong evidence against the null hypothesis that there is no association
# between chest pain and heart disease diagnosis.
# Therefore, based on the p-value obtained, we can reject the null hypothesis and conclude
# that there is a significant association between chest pain and heart disease diagnosis.
predictTrain = predict(Log_model, type='response')
#Confusion matrix using threshold of 0.5
table(Train_Data$target, predictTrain>0.5)
##
## FALSE TRUE
## 0 300 74
## 1 35 359
# FALSE TRUE
# 0 301 73
# 1 35 359
# The table shows the number of instances that fall into each combination of actual and predicted classes.
# In this case, there were 301 instances where the actual target was 0 and the model
# predicted 0 (true negative), 73 instances where the actual target was 0 and the
# model predicted 1 (false positive), 35 instances where the actual target was 1 and the
# model predicted 0 (false negative), and 359 instances where the actual target was 1 and
# the model predicted 1 (true positive).
# True Negative (TN): 301 - The number of instances where the model correctly predicted the
# absence of heart disease (actual target = 0)
# False Positive (FP): 73 - The number of instances where the model predicted the presence
# of heart disease (actual target = 0)
# False Negative (FN): 35 - The number of instances where the model predicted the absence
# of heart disease (actual target = 1)
# True Positive (TP): 359 - The number of instances where the model correctly predicted the
# presence of heart disease (actual target = 1)
#Calculate the accuracy on the training set
(301+359)/nrow(Train_Data) # 0.859375
## [1] 0.859375
#Predictions on Test set
predictTest = predict(Log_model, newdata=Test_Data, type='response')
#Confusion matrix using threshold of 0.5
table(Test_Data$target, predictTest>0.5)
##
## FALSE TRUE
## 0 103 22
## 1 12 120
# FALSE TRUE
# 0 105 20
# 1 12 120
# Anova test
anova(Log_model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: target
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 767 1064.15
## age 1 40.290 766 1023.86 2.189e-10 ***
## sex 1 80.482 765 943.38 < 2.2e-16 ***
## cp 1 142.059 764 801.32 < 2.2e-16 ***
## trestbps 1 17.529 763 783.79 2.830e-05 ***
## chol 1 5.270 762 778.52 0.02170 *
## fbs 1 3.061 761 775.46 0.08019 .
## restecg 1 2.469 760 772.99 0.11610
## thalach 1 71.812 759 701.18 < 2.2e-16 ***
## exang 1 25.360 758 675.82 4.756e-07 ***
## oldpeak 1 50.334 757 625.49 1.297e-12 ***
## slope 1 0.343 756 625.14 0.55791
## ca 1 48.249 755 576.90 3.755e-12 ***
## thal 1 30.164 754 546.73 3.969e-08 ***
## pvalues 1 0.400 753 546.33 0.52707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA (Analysis of Variance) test is used to determine whether there are significant differences
# between the means of two or more groups. In the context of logistic regression, an ANOVA test
# is used to determine if there is a significant difference in the deviance between two or more
# nested models.
#
# In the output, we can see the ANOVA table for the logistic regression model Log_model.
# The table shows the change in deviance for each variable added to the model, and the
# p-value associated with that change in deviance. The null hypothesis is that the model
# without the variable is not significantly different from the model with the variable.
# The alternative hypothesis is that the model with the variable is significantly better
# than the model without the variable.
#
# In this case, we can see that all the variables are statistically significant except
# for slope, restecg, and fbs (which has a p-value of 0.65). This suggests that the
# model is a good fit for the data and that all the variables are important in predicting
# the target variable.
# t-test
# t-test is statistical method used to determine the significant difference between the means of two groups.
# t-test to confirm the association between chest pain and heart disease
ttest_cp <- t.test(heart$cp ~ heart$target, var.equal= TRUE)
ttest_cp
##
## Two Sample t-test
##
## data: heart$cp by heart$target
## t = -15.445, df = 1023, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -1.009114 -0.781608
## sample estimates:
## mean in group 0 mean in group 1
## 1.482966 2.378327
# The output shows the test statistic t, which is -15.445, the degrees of freedom (df) which
# is 1023, and the p-value which is less than 2.2e-16, indicating strong evidence against
# the null hypothesis. The null hypothesis is that there is no significant difference in
# the means of chest pain between individuals with and without heart disease.
# The alternative hypothesis is that there is a significant difference in the means of
# chest pain between the two groups.
#
# The 95% confidence interval is also shown, which is -1.009114 to -0.781608. The sample
# means for each group are given as well, which are 1.482966 for group 0 (no heart disease)
# and 2.378327 for group 1 (heart disease).
#
# Therefore, the results of the t-test indicate a significant difference in the means of
# chest pain between individuals with and without heart disease. Individuals with heart
# disease tend to have a higher mean chest pain score than those without heart disease.
# Chi-test
# let's perform chi-squared test of independence to investigate the association between two
# categorical variables, chest pain (heart$cp) and heart disease diagnosis (heart$target).
CHI_cp <- chisq.test(heart$cp, heart$target)
# Print the results to see if p<0.05.
print(CHI_cp)
##
## Pearson's Chi-squared test
##
## data: heart$cp and heart$target
## X-squared = 280.98, df = 3, p-value < 2.2e-16
# The p-value is less than 2.2e-16, which indicates strong evidence against the null hypothesis
# of independence. Therefore, we can conclude that there is a significant association between
# chest pain and heart disease diagnosis.
# Calculate the confusion matrix
conf_matrix <- confusionMatrix(y_pred, y_act)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 103 12
## 1 22 120
##
## Accuracy : 0.8677
## 95% CI : (0.8201, 0.9066)
## No Information Rate : 0.5136
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7346
##
## Mcnemar's Test P-Value : 0.1227
##
## Sensitivity : 0.8240
## Specificity : 0.9091
## Pos Pred Value : 0.8957
## Neg Pred Value : 0.8451
## Prevalence : 0.4864
## Detection Rate : 0.4008
## Detection Prevalence : 0.4475
## Balanced Accuracy : 0.8665
##
## 'Positive' Class : 0
##
# Calculate the accuracy
accuracy <- conf_matrix$overall[["Accuracy"]]
print(paste0("Accuracy: ", accuracy))
## [1] "Accuracy: 0.867704280155642"
# Compute metrics
sensitivity <- conf_matrix$byClass["Sensitivity"]
print(sensitivity) # 0.84
## Sensitivity
## 0.824
specificity <- conf_matrix$byClass["Specificity"]
print(specificity) # 0.9090909
## Specificity
## 0.9090909
# ROC curve
library(pROC)
roc_data <- roc(Test_Data$target, predictions)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_data, col = "blue", print.auc = TRUE, auc.polygon = TRUE, max.auc.polygon = TRUE,
grid=c(0.1,0.2), grid.col=c("green", "yellow"), lwd=2, main="ROC Curve")
legend("bottomright", legend = paste("AUC =", round(auc(roc_data),2)), col = "blue", lty=1, cex=1.5)

# The logistic regression model has a sensitivity of 0.84, which means that the model
# correctly identifies 84% of the people who have heart disease.
# The specificity of the model is 0.909, which means that the model correctly identifies
# 90.9% of the people who do not have heart disease.
# The ROC curve plot shows the tradeoff between sensitivity and specificity for
# different probability thresholds of the model.
# The closer the curve is to the top left corner, the better the model performance.
# The AUC (Area Under the Curve) value is a measure of the model's overall performance,
# where an AUC of 1 represents a perfect model and an AUC of 0.5 represents a random classifier.
# In this case, the AUC is 0.91, which indicates a good performance of the model.
# Overall, the logistic regression model seems to perform well in predicting the
# presence of heart disease in the test data.
# 5) Hypothesis 5
# Is there a significant difference in the mean cholesterol levels between individuals
# with and without heart disease?
# chol: The person’s cholesterol measurement in mg/dl
# Two-sample t-test to compare mean cholesterol levels between individuals with and without heart disease
ttest_chol <- t.test(heart$chol ~ heart$target, var.equal=TRUE)
# Print the results
print(ttest_chol)
##
## Two Sample t-test
##
## data: heart$chol by heart$target
## t = 3.2134, df = 1023, p-value = 0.001353
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 4.015552 16.611444
## sample estimates:
## mean in group 0 mean in group 1
## 251.2926 240.9791
# p-value = 0.001353
# Since this value is less than 0.05, we can reject the null hypothesis and conclude that there
# is a significant difference in the mean cholesterol levels between individuals with and
# without heart disease.
# Factor Analysis
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(magrittr)
library(NbClust)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(dplyr)
library(psych)
##
## Attaching package: 'psych'
##
## The following object is masked from 'package:randomForest':
##
## outlier
##
## The following objects are masked from 'package:mosaic':
##
## logit, rescale
##
## The following objects are masked from 'package:scales':
##
## alpha, rescale
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
heart_data <- heart
heart_data
## # A tibble: 1,025 × 15
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 2 1 125 212 1 2 168 1 1 3
## 2 53 2 1 140 203 2 1 155 2 3.1 1
## 3 70 2 1 145 174 1 2 125 2 2.6 1
## 4 61 2 1 148 203 1 2 161 1 0 3
## 5 62 1 1 138 294 2 2 106 1 1.9 2
## 6 58 1 1 100 248 1 1 122 1 1 2
## 7 58 2 1 114 318 1 3 140 1 4.4 1
## 8 55 2 1 160 289 1 1 145 2 0.8 2
## 9 46 2 1 120 249 1 1 144 1 0.8 3
## 10 54 2 1 122 286 1 1 116 2 3.2 2
## # … with 1,015 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## # target <fct>, pvalues <dbl>
colnames(heart_data)
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target" "pvalues"
str(heart_data)
## spc_tbl_ [1,025 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:1025] 52 53 70 61 62 58 58 55 46 54 ...
## $ sex : num [1:1025] 2 2 2 2 1 1 2 2 2 2 ...
## $ cp : num [1:1025] 1 1 1 1 1 1 1 1 1 1 ...
## $ trestbps: num [1:1025] 125 140 145 148 138 100 114 160 120 122 ...
## $ chol : num [1:1025] 212 203 174 203 294 248 318 289 249 286 ...
## $ fbs : num [1:1025] 1 2 1 1 2 1 1 1 1 1 ...
## $ restecg : num [1:1025] 2 1 2 2 2 1 3 1 1 1 ...
## $ thalach : num [1:1025] 168 155 125 161 106 122 140 145 144 116 ...
## $ exang : num [1:1025] 1 2 2 1 1 1 1 2 1 2 ...
## $ oldpeak : num [1:1025] 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
## $ slope : num [1:1025] 3 1 1 3 2 2 1 2 3 2 ...
## $ ca : num [1:1025] 3 1 1 2 4 1 4 2 1 3 ...
## $ thal : num [1:1025] 3 3 3 3 2 2 1 3 3 2 ...
## $ target : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
## $ pvalues : num [1:1025] 0.055856 0.000377 0.001197 0.024796 0.000302 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_double(),
## .. cp = col_double(),
## .. trestbps = col_double(),
## .. chol = col_double(),
## .. fbs = col_double(),
## .. restecg = col_double(),
## .. thalach = col_double(),
## .. exang = col_double(),
## .. oldpeak = col_double(),
## .. slope = col_double(),
## .. ca = col_double(),
## .. thal = col_double(),
## .. target = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
attach(heart_data)
## The following objects are masked from heart:
##
## age, ca, chol, cp, exang, fbs, oldpeak, pvalues, restecg, sex,
## slope, target, thal, thalach, trestbps
#heart_data[1:6]
heart_data
## # A tibble: 1,025 × 15
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 52 2 1 125 212 1 2 168 1 1 3
## 2 53 2 1 140 203 2 1 155 2 3.1 1
## 3 70 2 1 145 174 1 2 125 2 2.6 1
## 4 61 2 1 148 203 1 2 161 1 0 3
## 5 62 1 1 138 294 2 2 106 1 1.9 2
## 6 58 1 1 100 248 1 1 122 1 1 2
## 7 58 2 1 114 318 1 3 140 1 4.4 1
## 8 55 2 1 160 289 1 1 145 2 0.8 2
## 9 46 2 1 120 249 1 1 144 1 0.8 3
## 10 54 2 1 122 286 1 1 116 2 3.2 2
## # … with 1,015 more rows, and 4 more variables: ca <dbl>, thal <dbl>,
## # target <fct>, pvalues <dbl>
heart_data$target <- as.numeric(heart_data$target)
fit.pc <- principal(heart_data, nfactors=4, rotate="varimax")
fit.pc
## Principal Components Analysis
## Call: principal(r = heart_data, nfactors = 4, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
## RC1 RC4 RC2 RC3 h2 u2 com
## age -0.30 0.08 0.41 0.49 0.51 0.49 2.7
## sex 0.06 0.47 0.14 -0.65 0.66 0.34 2.0
## cp 0.24 -0.62 0.27 -0.21 0.56 0.44 2.0
## trestbps -0.15 -0.08 0.54 0.25 0.39 0.61 1.6
## chol 0.10 0.19 0.11 0.73 0.60 0.40 1.2
## fbs 0.01 -0.09 0.68 -0.13 0.49 0.51 1.1
## restecg -0.08 -0.20 -0.38 -0.19 0.22 0.78 2.2
## thalach 0.67 -0.26 -0.02 -0.15 0.54 0.46 1.4
## exang -0.46 0.50 -0.04 0.00 0.46 0.54 2.0
## oldpeak -0.74 0.17 0.12 -0.01 0.59 0.41 1.2
## slope 0.82 0.03 -0.09 0.05 0.68 0.32 1.0
## ca -0.07 0.45 0.43 0.06 0.39 0.61 2.1
## thal -0.02 0.61 0.07 -0.05 0.38 0.62 1.0
## target 0.43 -0.70 -0.16 0.00 0.70 0.30 1.8
## pvalues 0.39 -0.12 -0.54 0.07 0.47 0.53 2.0
##
## RC1 RC4 RC2 RC3
## SS loadings 2.41 2.11 1.71 1.40
## Proportion Var 0.16 0.14 0.11 0.09
## Cumulative Var 0.16 0.30 0.42 0.51
## Proportion Explained 0.32 0.28 0.22 0.18
## Cumulative Proportion 0.32 0.59 0.82 1.00
##
## Mean item complexity = 1.7
## Test of the hypothesis that 4 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.09
## with the empirical chi square 1653.05 with prob < 1.1e-312
##
## Fit based upon off diagonal values = 0.81
round(fit.pc$values, 3)
## [1] 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677 0.632 0.498
## [13] 0.435 0.371 0.366
fit.pc$loadings
##
## Loadings:
## RC1 RC4 RC2 RC3
## age -0.302 0.414 0.491
## sex 0.469 0.139 -0.645
## cp 0.243 -0.621 0.272 -0.213
## trestbps -0.148 0.544 0.254
## chol 0.191 0.113 0.734
## fbs 0.681 -0.134
## restecg -0.195 -0.379 -0.191
## thalach 0.671 -0.261 -0.148
## exang -0.461 0.495
## oldpeak -0.738 0.166 0.125
## slope 0.819
## ca 0.447 0.429
## thal 0.609
## target 0.429 -0.697 -0.164
## pvalues 0.394 -0.125 -0.540
##
## RC1 RC4 RC2 RC3
## SS loadings 2.415 2.114 1.712 1.397
## Proportion Var 0.161 0.141 0.114 0.093
## Cumulative Var 0.161 0.302 0.416 0.509
# Loadings with more digits
for (i in c(1,3,2,4)) { print(fit.pc$loadings[[1,i]])}
## [1] -0.301812
## [1] 0.4141713
## [1] 0.07793931
## [1] 0.4912733
# Communalities
fit.pc$communality
## age sex cp trestbps chol fbs restecg thalach
## 0.5100523 0.6583869 0.5641369 0.3878055 0.5973798 0.4891392 0.2248950 0.5400533
## exang oldpeak slope ca thal target pvalues
## 0.4592026 0.5884998 0.6814477 0.3940923 0.3780577 0.6967032 0.4678972
# Rotated factor scores, Notice the columns ordering: RC1, RC3, RC2 and RC4
# fit.pc$scores
head(fit.pc$scores)
## RC1 RC4 RC2 RC3
## [1,] 0.9211371 1.43470866 -0.2316238 -0.6701889
## [2,] -1.5292302 0.40231673 1.1074114 -1.3275474
## [3,] -2.4316832 0.25523494 -0.2985576 -0.6532033
## [4,] 0.8354094 1.06924634 0.2405381 -0.2802802
## [5,] -1.0186815 -0.08943952 1.3159179 1.2129655
## [6,] -0.7248101 -0.72738032 -1.0491437 1.0132839
# Play with FA utilities
fa.parallel(heart_data) # See factor recommendation

## Parallel analysis suggests that the number of factors = 6 and the number of components = 4
fa.plot(fit.pc) # See Correlations within Factors

fa.diagram(fit.pc) # Visualize the relationship

vss(heart_data) # See Factor recommendations for a simple structure

##
## Very Simple Structure
## Call: vss(x = heart_data)
## VSS complexity 1 achieves a maximimum of 0.51 with 1 factors
## VSS complexity 2 achieves a maximimum of 0.64 with 8 factors
##
## The Velicer MAP achieves a minimum of 0.02 with 1 factors
## BIC achieves a minimum of -99.28 with 6 factors
## Sample Size adjusted BIC achieves a minimum of -26.21 with 7 factors
##
## Statistics by number of factors
## vss1 vss2 map dof chisq prob sqresid fit RMSEA BIC SABIC complex
## 1 0.51 0.00 0.018 90 1110 2.3e-175 11.7 0.51 0.105 486 772 1.0
## 2 0.48 0.59 0.021 76 786 1.4e-118 9.6 0.59 0.095 260 501 1.3
## 3 0.43 0.60 0.028 63 493 5.0e-68 8.4 0.64 0.082 57 257 1.5
## 4 0.44 0.63 0.034 51 322 5.9e-41 7.1 0.70 0.072 -32 130 1.6
## 5 0.43 0.62 0.042 40 198 8.2e-23 6.4 0.73 0.062 -79 48 1.7
## 6 0.41 0.61 0.057 30 109 7.5e-11 5.8 0.76 0.051 -99 -4 1.9
## 7 0.46 0.63 0.075 21 53 1.5e-04 4.9 0.79 0.038 -93 -26 1.8
## 8 0.49 0.64 0.101 13 31 3.1e-03 4.3 0.82 0.037 -59 -18 1.9
## eChisq SRMR eCRMS eBIC
## 1 1506 0.084 0.090 882
## 2 848 0.063 0.074 321
## 3 541 0.050 0.065 104
## 4 290 0.037 0.053 -64
## 5 172 0.028 0.046 -105
## 6 81 0.019 0.036 -127
## 7 38 0.013 0.030 -107
## 8 17 0.009 0.026 -73
# Computing Correlation Matrix
corrm.emp <- cor(heart_data)
corrm.emp
## age sex cp trestbps chol
## age 1.00000000 -0.10324030 -0.07196627 0.27112141 0.21982253
## sex -0.10324030 1.00000000 -0.04111909 -0.07897377 -0.19825787
## cp -0.07196627 -0.04111909 1.00000000 0.03817742 -0.08164102
## trestbps 0.27112141 -0.07897377 0.03817742 1.00000000 0.12797743
## chol 0.21982253 -0.19825787 -0.08164102 0.12797743 1.00000000
## fbs 0.12124348 0.02720046 0.07929359 0.18176662 0.02691716
## restecg -0.13269617 -0.05511721 0.04358061 -0.12379409 -0.14741024
## thalach -0.39022708 -0.04936524 0.30683928 -0.03926407 -0.02177209
## exang 0.08816338 0.13915681 -0.40151271 0.06119697 0.06738223
## oldpeak 0.20813668 0.08468656 -0.17473348 0.18743411 0.06488031
## slope -0.16910511 -0.02666629 0.13163278 -0.12044531 -0.01424787
## ca 0.27155053 0.11172891 -0.17620647 0.10455372 0.07425934
## thal 0.07224456 0.20211721 -0.16985403 0.05894881 0.09552541
## target -0.22932355 -0.27950076 0.43485425 -0.13877173 -0.09996559
## pvalues -0.24580364 -0.06777006 0.06640377 -0.19108407 -0.03900630
## fbs restecg thalach exang oldpeak
## age 0.121243479 -0.13269617 -0.390227075 0.08816338 0.20813668
## sex 0.027200461 -0.05511721 -0.049365243 0.13915681 0.08468656
## cp 0.079293586 0.04358061 0.306839282 -0.40151271 -0.17473348
## trestbps 0.181766624 -0.12379409 -0.039264069 0.06119697 0.18743411
## chol 0.026917164 -0.14741024 -0.021772091 0.06738223 0.06488031
## fbs 1.000000000 -0.10405124 -0.008865857 0.04926057 0.01085948
## restecg -0.104051244 1.00000000 0.048410637 -0.06560553 -0.05011425
## thalach -0.008865857 0.04841064 1.000000000 -0.38028087 -0.34979616
## exang 0.049260570 -0.06560553 -0.380280872 1.00000000 0.31084376
## oldpeak 0.010859481 -0.05011425 -0.349796163 0.31084376 1.00000000
## slope -0.061902374 0.08608609 0.395307843 -0.26733547 -0.57518854
## ca 0.137156259 -0.07807235 -0.207888416 0.10784854 0.22181603
## thal -0.030129129 -0.02030400 -0.106701873 0.20958207 0.20473483
## target -0.041163547 0.13446821 0.422895496 -0.43802855 -0.43844127
## pvalues -0.243712291 0.11754954 0.267218689 -0.27484412 -0.26075671
## slope ca thal target pvalues
## age -0.16910511 0.27155053 0.07224456 -0.22932355 -0.24580364
## sex -0.02666629 0.11172891 0.20211721 -0.27950076 -0.06777006
## cp 0.13163278 -0.17620647 -0.16985403 0.43485425 0.06640377
## trestbps -0.12044531 0.10455372 0.05894881 -0.13877173 -0.19108407
## chol -0.01424787 0.07425934 0.09552541 -0.09996559 -0.03900630
## fbs -0.06190237 0.13715626 -0.03012913 -0.04116355 -0.24371229
## restecg 0.08608609 -0.07807235 -0.02030400 0.13446821 0.11754954
## thalach 0.39530784 -0.20788842 -0.10670187 0.42289550 0.26721869
## exang -0.26733547 0.10784854 0.20958207 -0.43802855 -0.27484412
## oldpeak -0.57518854 0.22181603 0.20473483 -0.43844127 -0.26075671
## slope 1.00000000 -0.07344041 -0.09650143 0.34551175 0.27668649
## ca -0.07344041 1.00000000 0.14576170 -0.38208529 -0.26031799
## thal -0.09650143 0.14576170 1.00000000 -0.35128336 -0.11977944
## target 0.34551175 -0.38208529 -0.35128336 1.00000000 0.27596524
## pvalues 0.27668649 -0.26031799 -0.11977944 0.27596524 1.00000000
plot(corrm.emp)

heart_data_pca <- prcomp(heart_data, scale=TRUE)
summary(heart_data_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.8812 1.2761 1.12290 1.09975 1.00187 0.9836 0.94434
## Proportion of Variance 0.2359 0.1086 0.08406 0.08063 0.06692 0.0645 0.05945
## Cumulative Proportion 0.2359 0.3445 0.42855 0.50918 0.57610 0.6406 0.70005
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.88042 0.86359 0.82302 0.79486 0.70584 0.65941 0.6087
## Proportion of Variance 0.05168 0.04972 0.04516 0.04212 0.03321 0.02899 0.0247
## Cumulative Proportion 0.75172 0.80144 0.84660 0.88872 0.92193 0.95092 0.9756
## PC15
## Standard deviation 0.60465
## Proportion of Variance 0.02437
## Cumulative Proportion 1.00000
plot(heart_data_pca)

# A table containing eigenvalues and %'s accounted, follows. Eigenvalues are the sdev^2
(eigen_heart_data <- round(heart_data_pca$sdev^2,3))
## [1] 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677 0.632 0.498
## [13] 0.435 0.371 0.366
round(fit.pc$values, 3)
## [1] 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677 0.632 0.498
## [13] 0.435 0.371 0.366
names(eigen_heart_data) <- paste("PC",1:14,sep="")
eigen_heart_data
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13
## 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677 0.632 0.498 0.435
## PC14 <NA>
## 0.371 0.366
sumlambdas <- sum(eigen_heart_data)
sumlambdas
## [1] 15
propvar <- round(eigen_heart_data/sumlambdas,2)
propvar
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 <NA>
## 0.24 0.11 0.08 0.08 0.07 0.06 0.06 0.05 0.05 0.05 0.04 0.03 0.03 0.02 0.02
cumvar_heart_data <- cumsum(propvar)
cumvar_heart_data
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 <NA>
## 0.24 0.35 0.43 0.51 0.58 0.64 0.70 0.75 0.80 0.85 0.89 0.92 0.95 0.97 0.99
matlambdas <- rbind(eigen_heart_data,propvar,cumvar_heart_data)
matlambdas
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## eigen_heart_data 3.539 1.628 1.261 1.209 1.004 0.967 0.892 0.775 0.746 0.677
## propvar 0.240 0.110 0.080 0.080 0.070 0.060 0.060 0.050 0.050 0.050
## cumvar_heart_data 0.240 0.350 0.430 0.510 0.580 0.640 0.700 0.750 0.800 0.850
## PC11 PC12 PC13 PC14 <NA>
## eigen_heart_data 0.632 0.498 0.435 0.371 0.366
## propvar 0.040 0.030 0.030 0.020 0.020
## cumvar_heart_data 0.890 0.920 0.950 0.970 0.990
rownames(matlambdas) <- c("Eigenvalues","Prop. variance","Cum. prop. variance")
rownames(matlambdas)
## [1] "Eigenvalues" "Prop. variance" "Cum. prop. variance"
eigvec.emp <- heart_data_pca$rotation
print(heart_data_pca)
## Standard deviations (1, .., p=15):
## [1] 1.8812235 1.2760824 1.1229031 1.0997498 1.0018657 0.9835860 0.9443355
## [8] 0.8804198 0.8635916 0.8230152 0.7948601 0.7058415 0.6594143 0.6087487
## [15] 0.6046523
##
## Rotation (n x k) = (15 x 15):
## PC1 PC2 PC3 PC4 PC5
## age 0.24984321 -0.393171325 0.15361807 -0.079606227 0.265670994
## sex 0.10553952 0.361464951 -0.56292407 -0.074125500 -0.114973076
## cp -0.25290181 -0.297681988 -0.27233425 0.287501040 -0.214844981
## trestbps 0.14937255 -0.428285805 -0.08966072 0.003320797 -0.287626472
## chol 0.09471237 -0.303739885 0.36580424 -0.451616085 -0.252621540
## fbs 0.08323174 -0.352032612 -0.45349568 0.053831609 0.164408972
## restecg -0.11304826 0.221331435 0.12767202 0.256130125 0.342746434
## thalach -0.35120347 -0.052339607 -0.21840651 -0.179412840 -0.317999665
## exang 0.32415364 0.220331295 0.07806227 -0.022361829 0.005896473
## oldpeak 0.35607178 0.026082571 0.08488007 0.327357009 -0.289582165
## slope -0.31595929 0.003041486 -0.12301238 -0.505501618 0.267645055
## ca 0.25157744 -0.097916696 -0.22954447 -0.269823895 0.408595108
## thal 0.20652409 0.204043741 -0.13527341 -0.335630911 -0.368814119
## target -0.41282284 -0.179816304 0.06652019 0.170951351 0.019739549
## pvalues -0.28299732 0.204971521 0.26456660 -0.151597786 -0.137753273
## PC6 PC7 PC8 PC9 PC10
## age -0.21099100 0.26821531 -0.15155247 -0.408110421 0.19417752
## sex -0.01573767 0.23701769 -0.08262481 -0.168878000 0.26639344
## cp -0.25659179 0.17820935 0.22296981 -0.279683792 -0.02361997
## trestbps -0.18893939 -0.21656930 -0.71243542 0.154434372 -0.05839473
## chol 0.00765153 -0.13595235 0.43397344 -0.001779595 0.16942700
## fbs 0.26728749 -0.40754202 0.19535830 0.076039033 0.53037556
## restecg -0.59801390 -0.52674985 0.04044943 -0.005495562 0.15025462
## thalach -0.06800240 -0.18133486 0.07396642 0.357952185 -0.17614283
## exang 0.34264321 -0.38645630 -0.11687031 -0.140201440 -0.09372719
## oldpeak -0.19547803 0.07698159 0.13307982 0.295776684 0.05430526
## slope -0.03970721 -0.09647195 -0.20922289 -0.130127018 -0.11533282
## ca -0.30228492 0.22576669 0.17314099 0.485363761 -0.16917415
## thal -0.40051798 -0.20120688 0.12910253 -0.363589928 -0.04537439
## target 0.04533678 -0.04349071 0.06854687 -0.151837013 -0.09522989
## pvalues -0.09440683 0.20093115 -0.22985466 0.232388921 0.67747496
## PC11 PC12 PC13 PC14 PC15
## age -0.01932611 -0.047437714 0.537390771 0.119500844 -0.181936765
## sex 0.49687698 -0.200422930 0.081132788 -0.247441644 -0.070742161
## cp 0.19803390 0.501342315 -0.212736537 0.287046735 0.011677345
## trestbps 0.12382502 -0.037320096 -0.223082003 -0.135805308 -0.033488347
## chol 0.46644446 -0.104602836 -0.148672821 -0.104503840 0.008007358
## fbs -0.24642535 -0.007115338 0.014804407 -0.005539555 0.098767188
## restecg 0.24899625 -0.110129987 -0.002956789 0.039985565 -0.061566344
## thalach 0.03474022 -0.078479112 0.575846005 0.251362884 -0.297692793
## exang 0.22884343 0.627432700 0.192151175 0.005228984 -0.218876077
## oldpeak 0.03231800 0.134505256 0.388800475 -0.148989761 0.574044141
## slope 0.10427223 0.182241283 0.121999014 0.039393013 0.642222782
## ca -0.01372112 0.307189037 -0.116035804 -0.211876051 -0.212706280
## thal -0.52003280 0.053121250 -0.053353679 -0.137868206 -0.042544817
## target -0.04629167 0.116613648 0.188933508 -0.812507195 -0.104068366
## pvalues -0.14012863 0.346124807 -0.026412846 -0.008674171 -0.103143000
# Taking the first four PCs to generate linear combinations for all the variables with four factors
pcafactors.emp <- eigvec.emp[,1:4]
pcafactors.emp
## PC1 PC2 PC3 PC4
## age 0.24984321 -0.393171325 0.15361807 -0.079606227
## sex 0.10553952 0.361464951 -0.56292407 -0.074125500
## cp -0.25290181 -0.297681988 -0.27233425 0.287501040
## trestbps 0.14937255 -0.428285805 -0.08966072 0.003320797
## chol 0.09471237 -0.303739885 0.36580424 -0.451616085
## fbs 0.08323174 -0.352032612 -0.45349568 0.053831609
## restecg -0.11304826 0.221331435 0.12767202 0.256130125
## thalach -0.35120347 -0.052339607 -0.21840651 -0.179412840
## exang 0.32415364 0.220331295 0.07806227 -0.022361829
## oldpeak 0.35607178 0.026082571 0.08488007 0.327357009
## slope -0.31595929 0.003041486 -0.12301238 -0.505501618
## ca 0.25157744 -0.097916696 -0.22954447 -0.269823895
## thal 0.20652409 0.204043741 -0.13527341 -0.335630911
## target -0.41282284 -0.179816304 0.06652019 0.170951351
## pvalues -0.28299732 0.204971521 0.26456660 -0.151597786
# Multiplying each column of the eigenvector’s matrix by the square-root of the corresponding eigenvalue in order to get the factor loadings
unrot.fact.emp <- sweep(pcafactors.emp,MARGIN=2,heart_data_pca$sdev[1:4],`*`)
unrot.fact.emp
## PC1 PC2 PC3 PC4
## age 0.4700109 -0.501719027 0.17249821 -0.087546934
## sex 0.1985434 0.461259080 -0.63210918 -0.081519506
## cp -0.4757648 -0.379866760 -0.30580497 0.316179218
## trestbps 0.2810032 -0.546527999 -0.10068030 0.003652046
## chol 0.1781751 -0.387597137 0.41076271 -0.496664710
## fbs 0.1565775 -0.449222637 -0.50923170 0.059201302
## restecg -0.2126691 0.282437160 0.14336330 0.281679060
## thalach -0.6606922 -0.066789653 -0.24524935 -0.197309240
## exang 0.6098054 0.281160899 0.08765636 -0.024592418
## oldpeak 0.6698506 0.033283511 0.09531209 0.360010813
## slope -0.5943900 0.003881187 -0.13813098 -0.555925315
## ca 0.4732734 -0.124949777 -0.25775619 -0.296738781
## thal 0.3885180 0.260376637 -0.15189893 -0.369110036
## target -0.7766120 -0.229460429 0.07469573 0.188003718
## pvalues -0.5323812 0.261560560 0.29708266 -0.166719639
# Computing communalities
communalities.emp <- rowSums(unrot.fact.emp^2)
communalities.emp
## age sex cp trestbps chol fbs restecg thalach
## 0.5100523 0.6583869 0.5641369 0.3878055 0.5973798 0.4891392 0.2248950 0.5400533
## exang oldpeak slope ca thal target pvalues
## 0.4592026 0.5884998 0.6814477 0.3940923 0.3780577 0.6967032 0.4678972
# Performing the varimax rotation. The default in the varimax function is norm=TRUE thus, Kaiser normalization is carried out
rot.fact.emp <- varimax(unrot.fact.emp)
#View(unrot.fact.emp)
rot.fact.emp
## $loadings
##
## Loadings:
## PC1 PC2 PC3 PC4
## age 0.302 -0.414 0.491
## sex -0.139 -0.645 -0.469
## cp -0.243 -0.272 -0.213 0.621
## trestbps 0.148 -0.544 0.254
## chol -0.113 0.734 -0.191
## fbs -0.681 -0.134
## restecg 0.379 -0.191 0.195
## thalach -0.671 -0.148 0.261
## exang 0.461 -0.495
## oldpeak 0.738 -0.125 -0.166
## slope -0.819
## ca -0.429 -0.447
## thal -0.609
## target -0.429 0.164 0.697
## pvalues -0.394 0.540 0.125
##
## PC1 PC2 PC3 PC4
## SS loadings 2.415 1.712 1.397 2.114
## Proportion Var 0.161 0.114 0.093 0.141
## Cumulative Var 0.161 0.275 0.368 0.509
##
## $rotmat
## [,1] [,2] [,3] [,4]
## [1,] 0.71782434 -0.3439057 0.1320701 -0.5907745
## [2,] 0.05293757 0.7043005 -0.5337905 -0.4650012
## [3,] 0.27192129 0.6176082 0.7258728 0.1331452
## [4,] 0.63873676 0.0651898 -0.4131996 0.6457799
# The print method of varimax omits loadings less than abs(0.1). In order to display all the loadings, it is necessary to ask explicitly the contents of the object $loadings
fact.load.emp <- rot.fact.emp$loadings[1:14,1:4]
fact.load.emp
## PC1 PC2 PC3 PC4
## age 0.30181198 -0.41417126 0.491273341 -0.07793931
## sex -0.05701621 -0.13912528 -0.645141082 -0.46858635
## cp -0.24282439 -0.27217812 -0.212685649 0.62117397
## trestbps 0.14773462 -0.54350145 0.254253460 0.07707997
## chol -0.09816287 -0.11294735 0.733810412 -0.19107326
## fbs -0.01204250 -0.68088200 -0.133629387 0.08681632
## restecg 0.08119482 0.37896370 -0.191175659 0.19529667
## thalach -0.67051383 0.01584529 -0.148097699 0.26130531
## exang 0.46074471 0.04084030 0.004244965 -0.49520795
## oldpeak 0.73846654 -0.12458929 -0.008871091 -0.16602945
## slope -0.81911287 0.08559617 0.048869743 -0.02805113
## ca 0.07348527 -0.42930031 0.064716409 -0.44744303
## thal 0.01560266 -0.06810639 -0.045418152 -0.60919043
## target -0.42922186 0.16386084 -0.003546873 0.69685639
# Computing the rotated factor scores for the 30 European Countries. Notice that signs are reversed for factors F2 (PC2), F3 (PC3) and F4 (PC4)
scale.emp <- scale(heart_data[1:14])
# scale.emp
head(scale.emp)
## age sex cp trestbps chol fbs
## [1,] -0.2683056 0.6611813 -0.9153086 -0.3774513 -0.65901038 -0.4186735
## [2,] -0.1580799 0.6611813 -0.9153086 0.4788735 -0.83345431 2.3861656
## [3,] 1.7157579 0.6611813 -0.9153086 0.7643151 -1.39555140 -0.4186735
## [4,] 0.7237261 0.6611813 -0.9153086 0.9355801 -0.83345431 -0.4186735
## [5,] 0.8339519 -1.5109689 -0.9153086 0.3646969 0.93036760 2.3861656
## [6,] 0.3930489 -1.5109689 -0.9153086 -1.8046593 0.03876532 -0.4186735
## restecg thalach exang oldpeak slope ca
## [1,] 0.890820 0.8209198 -0.7119396 -0.06085868 0.9949476 1.2086307
## [2,] -1.003559 0.2558430 1.4032432 1.72629436 -2.2425804 -0.7316143
## [3,] 0.890820 -1.0481803 1.4032432 1.30078173 -2.2425804 -0.7316143
## [4,] 0.890820 0.5166477 -0.7119396 -0.91188394 0.9949476 0.2385082
## [5,] 0.890820 -1.8740617 -0.7119396 0.70506405 -0.6238164 2.1787531
## [6,] -1.003559 -1.1785826 -0.7119396 -0.06085868 -0.6238164 -0.7316143
## thal target
## [1,] 1.1150813 -1.0261968
## [2,] 1.1150813 -1.0261968
## [3,] 1.1150813 -1.0261968
## [4,] 1.1150813 -1.0261968
## [5,] -0.5510387 -1.0261968
## [6,] -0.5510387 0.9735213
head(as.matrix(scale.emp)%*%fact.load.emp%*%solve(t(fact.load.emp)%*%fact.load.emp))
## PC1 PC2 PC3 PC4
## [1,] -0.9349143 0.2652563 -0.6565332 -1.4395593
## [2,] 1.6446757 -1.3892331 -1.4419745 -0.3616709
## [3,] 2.4746866 0.1935791 -0.6958274 -0.2400944
## [4,] -0.8848376 -0.1198755 -0.2312880 -1.0866489
## [5,] 1.0805632 -1.4669814 1.1516297 0.1112267
## [6,] 0.5979801 1.3587567 1.1389951 0.6827263
# Different way of PCA Method
library(factoextra)
library(FactoMineR)
library(ggfortify)
## Registered S3 method overwritten by 'ggfortify':
## method from
## autoplot.glmnet parsnip
library(psych)
library(corrplot)
library(devtools)
## Loading required package: usethis
##
## Attaching package: 'devtools'
##
## The following object is masked from 'package:recipes':
##
## check
res.pca <- PCA(heart[,1:13], graph = FALSE)
print(res.pca)
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 1025 individuals, described by 13 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
# Visualize and Interpret PCA using these functions
#get_eigenvalue(res.pca): Extract the eigenvalues/variances of principal components
#fviz_eig(res.pca): Visualize the eigenvalues
#get_pca_ind(res.pca), get_pca_var(res.pca): Extract the results for individuals and variables, respectively.
#fviz_pca_ind(res.pca), fviz_pca_var(res.pca): Visualize the results individuals and variables, respectively.
#fviz_pca_biplot(res.pca): Make a biplot of individuals and variables.
eig.val <- get_eigenvalue(res.pca)
eig.val
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.7830596 21.408151 21.40815
## Dim.2 1.5585672 11.988978 33.39713
## Dim.3 1.2029097 9.253152 42.65028
## Dim.4 1.1669979 8.976907 51.62719
## Dim.5 0.9959175 7.660904 59.28809
## Dim.6 0.9625088 7.403914 66.69201
## Dim.7 0.8783455 6.756504 73.44851
## Dim.8 0.7679664 5.907433 79.35594
## Dim.9 0.7292387 5.609529 84.96547
## Dim.10 0.6315583 4.858141 89.82361
## Dim.11 0.5222838 4.017568 93.84118
## Dim.12 0.4316169 3.320130 97.16131
## Dim.13 0.3690297 2.838690 100.00000
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 50))

var <- get_pca_var(res.pca)
#var$coord: coordinates of variables to create a scatter plot
#var$cos2: represents the quality of representation for variables on the factor map. It’s calculated as the squared coordinates: var.cos2 = var.coord * var.coord.
#var$contrib: contains the contributions (in percentage) of the variables to the principal components.
#The contribution of a variable (var) to a given principal component is (in percentage) : (var.cos2 * 100) / (total cos2 of the component).
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
# Coordinates
head(var$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## age 0.5143487 0.4990651 -0.07634702 0.059195259 -0.32810646
## sex 0.1319762 -0.4752962 0.67194202 -0.003787684 0.04762913
## cp -0.4770016 0.3416973 0.21289872 -0.447663818 0.13043284
## trestbps 0.2968919 0.5471188 0.17124649 -0.144468670 0.21565878
## chol 0.2118213 0.4622542 -0.26740797 0.505462697 0.30811307
## fbs 0.1360315 0.3960127 0.49837684 -0.178310347 -0.16173188
# Cos2: quality on the factore map
head(var$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## age 0.26455456 0.2490659 0.005828868 3.504079e-03 0.107653851
## sex 0.01741773 0.2259065 0.451506072 1.434655e-05 0.002268534
## cp 0.22753051 0.1167570 0.045325866 2.004029e-01 0.017012726
## trestbps 0.08814481 0.2993390 0.029325360 2.087120e-02 0.046508710
## chol 0.04486824 0.2136789 0.071507020 2.554925e-01 0.094933662
## fbs 0.01850458 0.1568261 0.248379474 3.179458e-02 0.026157199
# Contributions to the principal components
head(var$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## age 9.5058889 15.980443 0.4845641 0.300264349 10.8095146
## sex 0.6258482 14.494497 37.5344940 0.001229356 0.2277834
## cp 8.1755530 7.491305 3.7680190 17.172515022 1.7082465
## trestbps 3.1671910 19.206040 2.4378688 1.788451900 4.6699359
## chol 1.6121913 13.709958 5.9445044 21.893144168 9.5322814
## fbs 0.6649006 10.062196 20.6482226 2.724476118 2.6264423
#The plot Below is also known as variable correlation plots. It shows the relationships between all variables. It can be interpreted as follow:
#Positively correlated variables are grouped together.
#Negatively correlated variables are positioned on opposite sides of the plot origin (opposed quadrants).
#The distance between variables and the origin measures the quality of the variables on the factor map.
#Variables that are away from the origin are well represented on the factor map.
# Correlation circle
fviz_pca_var(res.pca, col.var = "black")

# Quality of representation
corrplot(var$cos2, is.corr=FALSE)

# Total cos2 of variables on Dim.1 and Dim.2
#A high cos2 indicates a good representation of the variable on the principal component.
#In this case the variable is positioned close to the circumference of the correlation circle.
#A low cos2 indicates that the variable is not perfectly represented by the PCs.
#In this case the variable is close to the center of the circle.
fviz_cos2(res.pca, choice = "var", axes = 1:2)

fviz_pca_var(res.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)

# Change the transparency by cos2 values
fviz_pca_var(res.pca, alpha.var = "cos2")

corrplot(var$contrib, is.corr=FALSE)

# Contributions of variables to PC1
fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)

# Contributions of variables to PC2
fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)

fviz_pca_var(res.pca, col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

fviz_pca_var(res.pca, alpha.var = "contrib")

fviz_pca_ind(res.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = heart$target, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)

# Description of PC
res.desc <- dimdesc(res.pca, axes = c(1,2,3,4,5), proba = 0.05)
# Description of dimension 1
res.desc$Dim.1
##
## Link between the variable and the continuous variables (R-square)
## =================================================================================
## correlation p.value
## oldpeak 0.7023260 3.521895e-153
## exang 0.6083286 8.959399e-105
## age 0.5143487 2.660905e-70
## ca 0.4410908 4.856320e-50
## thal 0.3665971 5.881322e-34
## trestbps 0.2968919 2.643038e-22
## chol 0.2118213 7.323225e-12
## fbs 0.1360315 1.241619e-05
## sex 0.1319762 2.248189e-05
## restecg -0.2151083 3.401365e-12
## cp -0.4770016 2.334808e-59
## slope -0.6327058 1.004959e-115
## thalach -0.6951050 8.537505e-149
res.desc$Dim.2
##
## Link between the variable and the continuous variables (R-square)
## =================================================================================
## correlation p.value
## trestbps 0.54711885 4.316495e-81
## age 0.49906506 1.168945e-65
## chol 0.46225416 2.137136e-55
## fbs 0.39601272 8.008825e-40
## cp 0.34169727 1.906009e-29
## ca 0.13356227 1.785957e-05
## thalach 0.11386589 2.592693e-04
## slope 0.07643698 1.437421e-02
## oldpeak -0.08404940 7.094501e-03
## thal -0.23683979 1.555433e-14
## restecg -0.30497917 1.666111e-23
## exang -0.32276479 2.793006e-26
## sex -0.47529619 6.860180e-59
res.desc$Dim.3
##
## Link between the variable and the continuous variables (R-square)
## =================================================================================
## correlation p.value
## sex 0.67194202 1.430340e-135
## fbs 0.49837684 1.867951e-65
## ca 0.34634137 2.946136e-30
## thal 0.28697180 6.967823e-21
## thalach 0.21802204 1.705632e-12
## cp 0.21289872 5.703323e-12
## trestbps 0.17124649 3.454787e-08
## slope 0.16016684 2.537697e-07
## age -0.07634702 1.448987e-02
## restecg -0.26380120 8.847331e-18
## chol -0.26740797 3.041939e-18
res.desc$Dim.4
##
## Link between the variable and the continuous variables (R-square)
## =================================================================================
## correlation p.value
## slope 0.52474189 1.349658e-73
## chol 0.50546270 1.422337e-67
## thal 0.35745956 2.962931e-32
## ca 0.22956613 1.004412e-13
## exang 0.15192611 1.026625e-06
## thalach 0.09501493 2.325708e-03
## trestbps -0.14446867 3.417390e-06
## fbs -0.17831035 9.037288e-09
## restecg -0.19530646 2.862441e-10
## oldpeak -0.35893217 1.588975e-32
## cp -0.44766382 1.155834e-51
res.desc$Dim.5
##
## Link between the variable and the continuous variables (R-square)
## =================================================================================
## correlation p.value
## thal 0.32695877 5.805262e-27
## thalach 0.31981038 8.322597e-26
## chol 0.30811307 5.575289e-24
## trestbps 0.21565878 2.987761e-12
## oldpeak 0.21522758 3.307229e-12
## cp 0.13043284 2.805582e-05
## exang 0.09010505 3.887815e-03
## fbs -0.16173188 1.930194e-07
## slope -0.23206595 5.328331e-14
## age -0.32810646 3.760519e-27
## restecg -0.38207963 5.714017e-37
## ca -0.48777132 2.240638e-62
# Graph of Indiviuals
ind <- get_pca_ind(res.pca)
ind
## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
## Principal Component Analysis Results for individuals
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the individuals"
## 2 "$cos2" "Cos2 for the individuals"
## 3 "$contrib" "contributions of the individuals"
#To get access to the different components, use this:
# Coordinates of individuals
head(ind$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## 1 -0.5088278 -1.1281932 0.9623453 1.11441460 -0.8296152
## 2 2.6039019 -0.5451647 1.4705356 -1.52841708 1.6337808
## 3 3.0525119 -1.3222916 -0.4463478 -1.57977920 0.1247189
## 4 -0.4794296 -0.2946210 0.8184706 0.96132306 -0.7327194
## 5 2.1727510 1.9677780 -0.3847231 -0.25977074 -2.4420092
## 6 0.1703675 -0.1136465 -2.0357571 0.09181311 -0.3800811
# Quality of individuals
head(ind$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## 1 0.033283820 0.163628337 0.119056592 0.1596559580 0.088479973
## 2 0.326485488 0.014311003 0.104127567 0.1124859801 0.128529324
## 3 0.482643036 0.090566128 0.010319486 0.1292718899 0.000805705
## 4 0.027891023 0.010532759 0.081286959 0.1121381539 0.065146339
## 5 0.221856574 0.181972015 0.006955829 0.0031712673 0.280250896
## 6 0.002674068 0.001189902 0.381813258 0.0007766198 0.013309186
# Contributions of individuals
head(ind$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## 1 0.009076016 0.0796741757 0.07511125 0.1038244485 0.067422708
## 2 0.237685600 0.0186039875 0.17538571 0.1952944105 0.261481127
## 3 0.326639234 0.1094473104 0.01615808 0.2086406040 0.001523763
## 4 0.008057557 0.0054334800 0.05433120 0.0772582331 0.052593018
## 5 0.165490673 0.2423833660 0.01200438 0.0056413949 0.584180915
## 6 0.001017483 0.0008084676 0.33612053 0.0007047181 0.014151589
fviz_pca_ind(res.pca)

fviz_pca_ind(res.pca, col.ind = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping (slow if many points)
)

fviz_pca_ind(res.pca, pointsize = "cos2",
pointshape = 21, fill = "#E7B800",
repel = TRUE # Avoid text overlapping (slow if many points)
)

fviz_pca_ind(res.pca, col.ind = "cos2", pointsize = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping (slow if many points)
)

fviz_cos2(res.pca, choice = "ind")

# Total contribution on PC1 and PC2
fviz_contrib(res.pca, choice = "ind", axes = 1:2)

# Create a random continuous variable of length 23,
# Same length as the number of active individuals in the PCA
set.seed(123)
my.cont.var <- rnorm(1025)
# Color individuals by the continuous variable
fviz_pca_ind(res.pca, col.ind = my.cont.var,
gradient.cols = c("blue", "yellow", "red"),
legend.title = "Cont.Var")

heart$target <- as.factor(heart$target)
fviz_pca_ind(res.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = heart$target, # color by groups
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)

fviz_pca_ind(res.pca, geom.ind = "point", col.ind = heart$target,
palette = c("#00AFBB", "#E7B800", "#FC4E07"),
addEllipses = TRUE, ellipse.type = "confidence",
legend.title = "Groups"
)

fviz_pca_ind(res.pca,
label = "none", # hide individual labels
habillage = heart$target, # color by groups
addEllipses = TRUE, # Concentration ellipses
palette = "jco"
)

fviz_pca_var(res.pca, geom.var = c("point", "text"))

# Show individuals text labels only
fviz_pca_ind(res.pca, geom.ind = "text")

# Change the size of arrows an labels
fviz_pca_var(res.pca, arrowsize = 1, labelsize = 5,
repel = TRUE)

# Change points size, shape and fill color
# Change labelsize
fviz_pca_ind(res.pca,
pointsize = 3, pointshape = 21, fill = "lightblue",
labelsize = 5, repel = TRUE)

fviz_pca_ind(res.pca,
geom.ind = "point", # show points only (but not "text")
group.ind = heart$target, # color by groups
legend.title = "Groups",
mean.point = FALSE)

fviz_pca_ind(res.pca,
geom.ind = "point", # show points only (but not "text")
group.ind = heart$target, # color by groups
legend.title = "Groups",
mean.point = TRUE)

fviz_pca_var(res.pca, axes.linetype = "blank")

ind.p <- fviz_pca_ind(res.pca, geom = "point", col.ind = heart$target)
ggpubr::ggpar(ind.p,
title = "Principal Component Analysis",
subtitle = "Heart disease data",
caption = "Source: factoextra",
xlab = "PC1", ylab = "PC2",
legend.title = "Disease or No Disease", legend.position = "top",
ggtheme = theme_gray(), palette = "jco"
)

fviz_pca_biplot(res.pca, repel = TRUE,col.ind = heart$target,
col.var = "#2E9FDF", # Variables color
)

fviz_pca_biplot(res.pca,
col.ind = heart$target, palette = "jco",
addEllipses = TRUE, label = "var",
col.var = "black", repel = TRUE,
legend.title = "Target")

fviz_pca_biplot(res.pca,
# Fill individuals by groups
geom.ind = "point",
pointshape = 21,
pointsize = 2.5,
fill.ind = heart$target,
col.ind = "black",
# Color variable by groups
legend.title = list(fill = "Target", color = "Clusters"),
repel = TRUE # Avoid label overplotting
)+
ggpubr::fill_palette("jco")+ # Indiviual fill color
ggpubr::color_palette("npg") # Variable colors

fviz_pca_biplot(res.pca,
# Individuals
geom.ind = "point",
fill.ind = heart$target, col.ind = "black",
pointshape = 21, pointsize = 2,
palette = "jco",
addEllipses = TRUE,
# Variables
alpha.var ="contrib", col.var = "contrib",
gradient.cols = "RdYlBu",
legend.title = list(fill = "Survivorship", color = "Contrib",
alpha = "Contrib")
)

# Conclusion
# From the above observations, we got the significant features which is correct:
# "sex", "cp", "restecg", "exang", "slope", "ca", "thal"
# 1- Males are more vulnerable to be diagnosed with heart disease than females.
#
# 2- Chest Pain is most common factor that leads to heart disease for males and females.
#
# 3- Maximum heart rate achieved is the highest cause factor to cause heart disease for females where is Thalassemia is the highest to cause heart disease for males.
#
# 4- There is a high association between chest pain and heart disease diagnosis.
# Limitation
# The dataset is missing some useful information such as smoking, obesity or family history
# that can help in predicting.
# The following conditions are associated with increased prevalence of heart disease:
# Asymptomatic angina chest pain (relative to typical angina chest pain, atypical angina pain, or non-angina pain)
# Presence of exercise induced angina
# Lower fasting blood sugar
# Flat or down-sloaping peak exercise ST segment
# Presence of left ventricle hypertrophy
# Male
# Higher thelassemia score
# Higher age
# Lower max heart rate achieved
# Higher resting blood pressure
# Higher cholesterol
# Higher ST depression induced by exercise relative to rest